diff --git a/script/import_scripts/base/generic_database.rb b/script/import_scripts/base/generic_database.rb
index dafea94199e..f4062695e7a 100644
--- a/script/import_scripts/base/generic_database.rb
+++ b/script/import_scripts/base/generic_database.rb
@@ -14,10 +14,13 @@ module ImportScripts
 
       configure_database
       create_category_table
+      create_upload_table
       create_like_table
       create_user_table
       create_topic_table
       create_post_table
+      create_pm_topic_table
+      create_pm_post_table
     end
 
     def insert_category(category)
@@ -27,6 +30,15 @@ module ImportScripts
       SQL
     end
 
+    def insert_upload(upload)
+      @db.execute(<<-SQL, prepare(upload))
+        INSERT OR REPLACE INTO upload (id, user_id, original_filename,
+          filename, description, url)
+        VALUES (:id, :user_id, :original_filename,
+          :filename, :description, :url)
+      SQL
+    end
+
     def insert_user(user)
       @db.execute(<<-SQL, prepare(user))
         INSERT OR REPLACE
@@ -69,6 +81,16 @@ module ImportScripts
       end
     end
 
+    def insert_pm_topic(topic)
+      attachments = topic.delete(:attachments)
+      topic[:upload_count] = attachments&.size || 0
+
+      @db.execute(<<-SQL, prepare(topic))
+        INSERT OR REPLACE INTO pm_topic (id, title, raw, category_id, closed, user_id, created_at, url, upload_count, target_users)
+        VALUES (:id, :title, :raw, :category_id, :closed, :user_id, :created_at, :url, :upload_count, :target_users)
+      SQL
+    end
+
     def insert_post(post)
       like_user_ids = post.delete(:like_user_ids)
       attachments = post.delete(:attachments)
@@ -92,6 +114,16 @@ module ImportScripts
       end
     end
 
+    def insert_pm_post(post)
+      attachments = post.delete(:attachments)
+      post[:upload_count] = attachments&.size || 0
+
+      @db.execute(<<-SQL, prepare(post))
+        INSERT OR REPLACE INTO pm_post (id, raw, topic_id, user_id, created_at, reply_to_post_id, url, upload_count)
+        VALUES (:id, :raw, :topic_id, :user_id, :created_at, :reply_to_post_id, :url, :upload_count)
+      SQL
+    end
+
     def sort_posts_by_created_at
       @db.execute "DELETE FROM post_order"
 
@@ -160,6 +192,13 @@ module ImportScripts
       SQL
     end
 
+    def count_pm_topics
+      @db.get_first_value(<<-SQL)
+        SELECT COUNT(*)
+        FROM pm_topic
+      SQL
+    end
+
     def fetch_topics(last_id)
       rows = @db.execute(<<-SQL, last_id)
         SELECT *
@@ -172,6 +211,18 @@ module ImportScripts
       add_last_column_value(rows, "id")
     end
 
+    def fetch_pm_topics(last_id)
+      rows = @db.execute(<<-SQL, last_id)
+        SELECT *
+        FROM pm_topic
+        WHERE id > :last_id
+        ORDER BY id
+        LIMIT #{@batch_size}
+      SQL
+
+      add_last_column_value(rows, "id")
+    end
+
     def fetch_topic_attachments(topic_id)
       @db.execute(<<-SQL, topic_id)
         SELECT path
@@ -187,6 +238,21 @@ module ImportScripts
       SQL
     end
 
+    def count_pm_posts
+      @db.get_first_value(<<-SQL)
+        SELECT COUNT(*)
+        FROM pm_post
+      SQL
+    end
+
+    def fetch_upload(id)
+      @db.execute(<<-SQL, id)
+        SELECT *
+        FROM upload
+        WHERE id = :id
+      SQL
+    end
+
     def fetch_posts(last_row_id)
       rows = @db.execute(<<-SQL, last_row_id)
         SELECT ROWID AS rowid, *
@@ -199,6 +265,18 @@ module ImportScripts
       add_last_column_value(rows, "rowid")
     end
 
+    def fetch_pm_posts(last_row_id)
+      rows = @db.execute(<<-SQL, last_row_id)
+        SELECT ROWID AS rowid, *
+        FROM pm_post
+        WHERE ROWID > :last_row_id
+        ORDER BY ROWID
+        LIMIT #{@batch_size}
+      SQL
+
+      add_last_column_value(rows, "rowid")
+    end
+
     def fetch_sorted_posts(last_row_id)
       rows = @db.execute(<<-SQL, last_row_id)
         SELECT o.ROWID AS rowid, p.*
@@ -270,6 +348,19 @@ module ImportScripts
       SQL
     end
 
+    def create_upload_table
+      @db.execute <<-SQL
+        CREATE TABLE IF NOT EXISTS upload (
+          id #{key_data_type} NOT NULL PRIMARY KEY,
+          user_id INTEGER,
+          original_filename TEXT,
+          filename TEXT,
+          description TEXT,
+          url TEXT
+        )
+      SQL
+    end
+
     def create_like_table
       @db.execute <<-SQL
         CREATE TABLE IF NOT EXISTS like (
@@ -325,6 +416,23 @@ module ImportScripts
       @db.execute "CREATE UNIQUE INDEX IF NOT EXISTS topic_upload_unique ON topic_upload(topic_id, path)"
     end
 
+    def create_pm_topic_table
+      @db.execute <<-SQL
+        CREATE TABLE IF NOT EXISTS pm_topic (
+          id #{key_data_type} NOT NULL PRIMARY KEY,
+          title TEXT,
+          raw TEXT,
+          category_id #{key_data_type},
+          closed BOOLEAN NOT NULL DEFAULT false,
+          user_id #{key_data_type} NOT NULL,
+          target_users TEXT,
+          created_at DATETIME,
+          url TEXT,
+          upload_count INTEGER DEFAULT 0
+        )
+      SQL
+    end
+
     def create_post_table
       @db.execute <<-SQL
         CREATE TABLE IF NOT EXISTS post (
@@ -357,6 +465,21 @@ module ImportScripts
       @db.execute "CREATE UNIQUE INDEX IF NOT EXISTS post_upload_unique ON post_upload(post_id, path)"
     end
 
+    def create_pm_post_table
+      @db.execute <<-SQL
+        CREATE TABLE IF NOT EXISTS pm_post (
+          id #{key_data_type} NOT NULL PRIMARY KEY,
+          raw TEXT,
+          topic_id #{key_data_type} NOT NULL,
+          user_id #{key_data_type} NOT NULL,
+          created_at DATETIME,
+          reply_to_post_id #{key_data_type},
+          url TEXT,
+          upload_count INTEGER DEFAULT 0
+        )
+      SQL
+    end
+
     def prepare(hash)
       hash.each do |key, value|
         if value.is_a?(TrueClass) || value.is_a?(FalseClass)
diff --git a/script/import_scripts/yammer.rb b/script/import_scripts/yammer.rb
new file mode 100644
index 00000000000..907ca4a6be8
--- /dev/null
+++ b/script/import_scripts/yammer.rb
@@ -0,0 +1,462 @@
+# frozen_string_literal: true
+
+# Yammer importer
+# https://docs.microsoft.com/en-us/yammer/manage-security-and-compliance/export-yammer-enterprise-data#export-yammer-network-data-by-date-range-and-network
+#
+# The export directory must contain these CSV files:
+#
+# - Users.csv Groups.csv Files.csv Messages.csv
+# (Others included in Yammer export are ignored)
+
+require "csv"
+require_relative "base"
+require_relative "base/generic_database"
+
+# Call it like this:
+# RAILS_ENV=production bundle exec ruby script/import_scripts/yammer.rb DIRNAME
+#
+# Optional environment variables: NUM_WORDS_IN_TITLE, PARENT_CATEGORY_NAME, PM_TAG
+
+class ImportScripts::Yammer < ImportScripts::Base
+  BATCH_SIZE = 1000
+  # `nil.to_i` is 0, which is truthy, so the default has to be applied before
+  # converting; otherwise an unset variable means "no limit" instead of 20.
+  NUM_WORDS_IN_TITLE = (ENV["NUM_WORDS_IN_TITLE"].presence || 20).to_i
+  SKIP_EMPTY_EMAIL = true
+  SKIP_INACTIVE_USERS = false
+  PARENT_CATEGORY_NAME = ENV["PARENT_CATEGORY_NAME"] || "Yammer Import"
+  # import groups as tags rather than as categories
+  IMPORT_GROUPS_AS_TAGS = true
+  MERGE_USERS = true
+  SiteSetting.tagging_enabled = true if IMPORT_GROUPS_AS_TAGS
+  PM_TAG = ENV["PM_TAG"] || "eht"
+  # Attachment markers found in the `attachments` column of Messages.csv.
+  OPENGRAPH_REGEX = /opengraphobject:\[(\d*?) : (.*?) : title="(.*?)" : description="(.*?)"\]/
+  UPLOADED_FILE_REGEX = /uploadedfile:(\d*)$/
+
+  def initialize(path)
+    super()
+
+    @path = path
+    @db = ImportScripts::GenericDatabase.new(@path, batch_size: BATCH_SIZE, recreate: true)
+  end
+
+  # Runs the whole import: stage the CSV data in the intermediate database,
+  # then create the Discourse records from it.
+  def execute
+    create_developer_users
+    read_csv_files
+
+    import_categories
+    import_users
+    import_topics
+    import_pm_topics
+    import_posts
+    import_pm_posts
+  end
+
+  def create_developer_users
+    # `to_s` keeps this a no-op when no developer emails are configured
+    GlobalSetting
+      .developer_emails
+      .to_s
+      .split(",")
+      .each { |e| User.create(email: e, active: true, username: e.split("@")[0]) }
+  end
+
+  # Loads the Users, Groups, Files and Messages CSV files into the intermediate database.
+  def read_csv_files
+    puts "", "reading CSV files"
+
+    # consider csv_parse Tags.csv
+    # consider Admins.csv that has admins
+
+    csv_parse("Users") do |row|
+      next if SKIP_INACTIVE_USERS && row[:state] != "active"
+      next if SKIP_EMPTY_EMAIL && row[:email].blank?
+      @db.insert_user(
+        id: row[:id],
+        email: row[:email],
+        name: row[:name],
+        username: row[:name],
+        bio: "",
+        # job_title: row[:job_title],
+        # location: row[:location],
+        # department: row[:department],
+        created_at: parse_datetime(row[:joined_at]),
+        # deleted_at: parse_datetime(row[:deleted_at]),
+        # suspended_at: parse_datetime(row[:suspended_at]),
+        # guid: row[:guid],
+        # state: row[:state],
+        avatar_path: row[:user_cover_image_url],
+        # last_seen_at: ,
+        active: row[:state] == "active" ? 1 : 0,
+      )
+    end
+
+    category_position = 0
+    csv_parse("Groups") do |row|
+      @db.insert_category(
+        id: row[:id],
+        name: row[:name],
+        description: row[:description],
+        position: category_position += 1,
+      )
+    end
+
+    csv_parse("Files") do |row|
+      @db.insert_upload(
+        id: row[:file_id],
+        user_id: row[:uploader_id],
+        original_filename: row[:name],
+        filename: row[:path],
+        description: row[:description],
+      )
+    end
+
+    # A message whose thread_id equals its own id starts a thread (topic);
+    # every other message is a reply. Private conversations are kept in
+    # separate pm_* tables.
+    csv_parse("Messages") do |row|
+      next unless row[:deleted_at].blank?
+      # next if row[:message_type] == 'system'
+      private_message = row[:in_private_conversation] == "true"
+
+      if row[:thread_id] == row[:id]
+        topic = topic_attributes(row)
+        if private_message
+          topic[:target_users] = row[:participants].to_s.gsub("user:", "")
+          @db.insert_pm_topic(topic)
+        else
+          @db.insert_topic(topic)
+        end
+      elsif private_message
+        @db.insert_pm_post(reply_attributes(row))
+      else
+        @db.insert_post(reply_attributes(row))
+      end
+    end
+
+    #@db.delete_unused_users
+    @db.sort_posts_by_created_at
+  end
+
+  def parse_datetime(text)
+    return nil if text.blank? || text == "null"
+    DateTime.parse(text)
+  end
+
+  def import_categories
+    puts "", "creating categories"
+    parent_category = nil
+    if PARENT_CATEGORY_NAME.present?
+      parent_category =
+        Category.find_by(name: PARENT_CATEGORY_NAME) ||
+          Category.create(name: PARENT_CATEGORY_NAME, user_id: Discourse.system_user.id)
+    end
+
+    rows = @db.fetch_categories
+    if IMPORT_GROUPS_AS_TAGS
+      @tag_map = {}
+      rows.each { |row| @tag_map[row["id"]] = row["name"] }
+    else
+      create_categories(rows) do |row|
+        {
+          id: row["id"],
+          name: row["name"],
+          description: row["description"],
+          position: row["position"],
+          # pass the id, as the key name says, rather than the Category record
+          parent_category_id: parent_category&.id,
+        }
+      end
+    end
+  end
+
+  def batches
+    super(BATCH_SIZE)
+  end
+
+  def import_users
+    puts "", "creating users"
+    total_count = @db.count_users
+    puts "", "Got #{total_count} users!"
+    last_id = ""
+
+    batches do |offset|
+      rows, last_id = @db.fetch_users(last_id)
+      break if rows.empty?
+
+      next if all_records_exist?(:users, rows.map { |row| row["id"] })
+
+      create_users(rows, total: total_count, offset: offset) do |row|
+        # Reuse an existing account with the same email instead of creating a duplicate
+        user = User.find_by_email(row["email"].downcase) if MERGE_USERS && row["email"].present?
+        if user
+          user.custom_fields["import_id"] = row["id"]
+          user.custom_fields["matched_existing"] = "yes"
+          user.save
+          add_user(row["id"].to_s, user)
+          next
+        end
+        {
+          id: row["id"],
+          email: row["email"],
+          name: row["name"],
+          created_at: row["created_at"],
+          last_seen_at: row["last_seen_at"],
+          active: row["active"] == 1,
+        }
+      end
+    end
+  end
+
+  def import_topics
+    puts "", "creating topics"
+    staff_guardian = Guardian.new(Discourse.system_user)
+    base_category = Category.find_by(name: PARENT_CATEGORY_NAME)
+
+    total_count = @db.count_topics
+    last_id = ""
+
+    batches do |offset|
+      rows, last_id = @db.fetch_topics(last_id)
+      break if rows.empty?
+
+      next if all_records_exist?(:posts, rows.map { |row| import_topic_id(row["id"]) })
+
+      create_posts(rows, total: total_count, offset: offset) do |row|
+        topic_post_attributes(row, base_category).merge(
+          post_create_action:
+            proc do |post|
+              tag_name = @tag_map[row["category_id"]] if IMPORT_GROUPS_AS_TAGS
+              if tag_name
+                topic = Topic.find(post.topic_id)
+                DiscourseTagging.tag_topic_by_names(topic, staff_guardian, [tag_name])
+              end
+            end,
+        )
+      end
+    end
+  end
+
+  def import_pm_topics
+    puts "", "creating pm topics"
+    staff_guardian = Guardian.new(Discourse.system_user)
+    base_category = Category.find_by(name: PARENT_CATEGORY_NAME)
+
+    total_count = @db.count_pm_topics
+    last_id = ""
+
+    batches do |offset|
+      rows, last_id = @db.fetch_pm_topics(last_id)
+      break if rows.empty?
+
+      next if all_records_exist?(:posts, rows.map { |row| import_topic_id(row["id"]) })
+
+      create_posts(rows, total: total_count, offset: offset) do |row|
+        target_usernames =
+          row["target_users"]
+            .to_s
+            .split(",")
+            .map { |imported_id| user_id_from_imported_user_id(imported_id) }
+            .compact
+            .map { |user_id| User.find(user_id).username }
+            .join(",")
+
+        topic_post_attributes(row, base_category).merge(
+          archetype: Archetype.private_message,
+          target_usernames: target_usernames,
+          post_create_action:
+            proc do |post|
+              if PM_TAG
+                topic = Topic.find(post.topic_id)
+                DiscourseTagging.tag_topic_by_names(topic, staff_guardian, [PM_TAG])
+              end
+            end,
+        )
+      end
+    end
+  end
+
+  def import_topic_id(topic_id)
+    "T#{topic_id}"
+  end
+
+  def import_posts
+    puts "", "creating posts"
+    total_count = @db.count_posts
+    last_row_id = 0
+
+    batches do |offset|
+      rows, last_row_id = @db.fetch_sorted_posts(last_row_id)
+      break if rows.empty?
+
+      next if all_records_exist?(:posts, rows.map { |row| row["id"] })
+
+      create_posts(rows, total: total_count, offset: offset) { |row| reply_post_attributes(row) }
+    end
+  end
+
+  def import_pm_posts
+    puts "", "creating pm posts"
+    total_count = @db.count_pm_posts
+    last_row_id = 0
+
+    batches do |offset|
+      rows, last_row_id = @db.fetch_pm_posts(last_row_id)
+      break if rows.empty?
+
+      next if all_records_exist?(:posts, rows.map { |row| row["id"] })
+
+      create_posts(rows, total: total_count, offset: offset) { |row| reply_post_attributes(row) }
+    end
+  end
+
+  def normalize_raw(raw)
+    # NOTE(review): '\n' in single quotes is a literal backslash followed by "n",
+    # not a newline, so only escaped sequences are removed - confirm this is intended.
+    raw = raw.to_s.gsub('\n', "")
+    # Turn [[user:123]] mentions into @username; ids that were not imported are
+    # left as the bare id.
+    raw.gsub(/\[\[user:(\d+)\]\]/) do
+      imported_id = Regexp.last_match(1)
+      user_id = user_id_from_imported_user_id(imported_id)
+      user_id ? "@#{User.find(user_id).username}" : imported_id
+    end
+  end
+
+  def permalink_exists?(url)
+    Permalink.find_by(url: url)
+  end
+
+  def csv_parse(table_name)
+    CSV.foreach(
+      File.join(@path, "#{table_name}.csv"),
+      headers: true,
+      header_converters: :symbol,
+      skip_blanks: true,
+      encoding: "bom|utf-8",
+    ) { |row| yield row }
+  end
+
+  private
+
+  # Columns shared by the topic and pm_topic staging tables, built from a
+  # thread-starting row of Messages.csv.
+  def topic_attributes(row)
+    title, raw = title_and_raw_from_message(row)
+    {
+      id: row[:id],
+      title: title,
+      raw: raw,
+      category_id: row[:group_id],
+      closed: row[:closed] == "TRUE" ? 1 : 0,
+      user_id: row[:sender_id],
+      created_at: parse_datetime(row[:created_at]),
+    }
+  end
+
+  # Columns shared by the post and pm_post staging tables. Body and
+  # attachments are joined with a newline; a nil field is simply left out.
+  def reply_attributes(row)
+    {
+      id: row[:id],
+      raw: [row[:body], row[:attachments]].compact.join("\n"),
+      topic_id: row[:thread_id],
+      reply_to_post_id: row[:replied_to_id],
+      user_id: row[:sender_id],
+      created_at: parse_datetime(row[:created_at]),
+    }
+  end
+
+  # Returns [title, raw] for a thread-starting message, folding a link
+  # preview and an uploaded file from the attachments column into the body.
+  def title_and_raw_from_message(row)
+    title = ""
+    raw = row[:body].to_s
+    attachments = row[:attachments]
+    return title, raw if attachments.blank?
+
+    attachments.match(OPENGRAPH_REGEX) do |m|
+      url = m[2]
+      title = m[3] if m[3]
+      raw += "\n***\n#{m[4]}\n#{url}\n" unless raw.include?(url)
+    end
+
+    attachments.match(UPLOADED_FILE_REGEX) do |m|
+      html = uploaded_file_html(m[1].to_i, row[:sender_id])
+      raw += html if html
+    end
+
+    [title, raw]
+  end
+
+  # Creates a Discourse upload for the staged file and returns the markup to
+  # embed it, or nil when the file is unknown or missing on disk.
+  def uploaded_file_html(file_id, imported_user_id)
+    staged = @db.fetch_upload(file_id).first
+    return if staged.nil? || staged["filename"].blank?
+
+    path = File.join(@path, staged["filename"])
+    return unless File.exist?(path)
+
+    filename = staged["original_filename"]
+    # NOTE(review): read_csv_files runs before import_users, so this lookup may
+    # not find the sender yet; the system user is the fallback.
+    user_id = user_id_from_imported_user_id(imported_user_id) || Discourse.system_user.id
+    upload = create_upload(user_id, path, filename)
+    html_for_upload(upload, filename) if upload&.persisted?
+  end
+
+  # Attributes common to public and private topics, in the shape `create_posts` is given.
+  def topic_post_attributes(row, base_category)
+    {
+      id: import_topic_id(row["id"]),
+      title: topic_title(row),
+      raw: normalize_raw(row["raw"]),
+      category:
+        (
+          if IMPORT_GROUPS_AS_TAGS
+            base_category&.id
+          else
+            category_id_from_imported_category_id(row["category_id"])
+          end
+        ),
+      user_id: user_id_from_imported_user_id(row["user_id"]) || Discourse.system_user.id,
+      created_at: row["created_at"],
+      closed: row["closed"] == 1,
+    }
+  end
+
+  # Uses the stored title, or else the leading NUM_WORDS_IN_TITLE pieces of the body.
+  def topic_title(row)
+    return row["title"] if row["title"].present?
+    row["raw"].to_s.split(/\W/)[0..(NUM_WORDS_IN_TITLE - 1)].join(" ")
+  end
+
+  # Attributes for a reply, or nil (skipping the row) when its topic was not imported.
+  def reply_post_attributes(row)
+    topic = topic_lookup_from_imported_post_id(import_topic_id(row["topic_id"]))
+    if topic.nil?
+      p "MISSING TOPIC #{row["topic_id"]}"
+      p row
+      return
+    end
+
+    {
+      id: row["id"],
+      raw: normalize_raw(row["raw"]),
+      user_id: user_id_from_imported_user_id(row["user_id"]) || Discourse.system_user.id,
+      topic_id: topic[:topic_id],
+      created_at: row["created_at"],
+    }
+  end
+end
+
+unless ARGV[0] && Dir.exist?(ARGV[0])
+  puts "", "Usage:", "", "bundle exec ruby script/import_scripts/yammer.rb DIRNAME", ""
+  exit 1
+end
+
+ImportScripts::Yammer.new(ARGV[0]).perform