From 2d909f789427f9bb7ed62184d8af4a83769bef2d Mon Sep 17 00:00:00 2001
From: Arpit Jalan
Date: Thu, 3 Aug 2017 21:13:02 +0530
Subject: [PATCH] new phpBB PostgreSQL bulk import script

---
Review notes (this section is free-form and not part of the commit message):
 * Line breaks and indentation were rebuilt from a whitespace-collapsed copy
   of this patch; context-line whitespace may need --ignore-whitespace.
 * The blob id on the new file's "index" line predates the edits below.
 * phpbb_postgresql.rb: numeric columns are converted before comparison,
   PM titles are actually HTML-unescaped, birthdays are parsed day-first,
   group recipients are no longer treated as users, and the magic-link and
   smiley patterns in process_raw_text are restored.

 Gemfile                                |   1 +
 lib/tasks/import.rake                  |  16 +
 script/bulk_import/base.rb             |   8 +-
 script/bulk_import/phpbb_postgresql.rb | 497 +++++++++++++++++++++++++
 4 files changed, 520 insertions(+), 2 deletions(-)
 create mode 100644 script/bulk_import/phpbb_postgresql.rb

diff --git a/Gemfile b/Gemfile
index ba6165e3506..61d79c30970 100644
--- a/Gemfile
+++ b/Gemfile
@@ -194,4 +194,5 @@ if ENV["IMPORT"] == "1"
   gem 'mysql2'
   gem 'redcarpet'
   gem 'sqlite3', '~> 1.3.13'
+  gem 'ruby-bbcode-to-md', :github => 'nlalonde/ruby-bbcode-to-md'
 end
diff --git a/lib/tasks/import.rake b/lib/tasks/import.rake
index a0ec50de6c4..f4542d51a38 100644
--- a/lib/tasks/import.rake
+++ b/lib/tasks/import.rake
@@ -428,3 +428,19 @@ def exec_sql(sql)
     ActiveRecord::Base.exec_sql(sql)
   end
 end
+
+task "import:create_phpbb_permalinks" => :environment do
+  log 'Creating Permalinks...'
+
+  # /[^\/]+\/.*-t(\d+).html
+  SiteSetting.permalink_normalizations = '/[^\/]+\/.*-t(\d+).html/thread/\1'
+
+  Topic.listable_topics.find_each do |topic|
+    tcf = topic.custom_fields
+    if tcf && tcf["import_id"]
+      Permalink.create(url: "thread/#{tcf["import_id"]}", topic_id: topic.id) rescue nil
+    end
+  end
+
+  log "Done!"
+end
diff --git a/script/bulk_import/base.rb b/script/bulk_import/base.rb
index 189d154740a..6ef8e3b0117 100644
--- a/script/bulk_import/base.rb
+++ b/script/bulk_import/base.rb
@@ -229,8 +229,8 @@ class BulkImport::Base
       group[:name] = group_name
     end
 
-    group[:title] = group[:title].scrub.strip.presence
-    group[:bio_raw] = group[:bio_raw].scrub.strip.presence
+    group[:title] = group[:title].scrub.strip.presence if group[:title].present?
+    group[:bio_raw] = group[:bio_raw].scrub.strip.presence if group[:bio_raw].present?
     group[:bio_cooked] = pre_cook(group[:bio_raw]) if group[:bio_raw].present?
     group[:created_at] ||= NOW
     group[:updated_at] ||= group[:created_at]
@@ -307,6 +307,7 @@ class BulkImport::Base
   def process_user_profile(user_profile)
     user_profile[:bio_raw] = (user_profile[:bio_raw].presence || "").scrub.strip.presence
     user_profile[:bio_cooked] = pre_cook(user_profile[:bio_raw]) if user_profile[:bio_raw].present?
+    user_profile[:views] ||= 0
     user_profile
   end
 
@@ -526,6 +527,9 @@ class BulkImport::Base
         value: imported_id,
       }
     end
+  rescue => e
+    puts e.message
+    puts e.backtrace.join("\n")
   end
 
   def create_custom_fields(table, name, rows)
diff --git a/script/bulk_import/phpbb_postgresql.rb b/script/bulk_import/phpbb_postgresql.rb
new file mode 100644
index 00000000000..932a96a31b8
--- /dev/null
+++ b/script/bulk_import/phpbb_postgresql.rb
@@ -0,0 +1,497 @@
+require_relative "base"
+require "pg"
+require "htmlentities"
+require 'ruby-bbcode-to-md'
+
+# Bulk importer that reads a phpBB forum from PostgreSQL and imports groups,
+# users, categories, topics, posts and private messages.
+#
+# Configuration is read from the environment: TABLE_PREFIX, DB_CHARSET,
+# DB_NAME and DB_PASSWORD.
+class BulkImport::PhpBB < BulkImport::Base
+
+  SUSPENDED_TILL ||= Date.new(3000, 1, 1)
+  TABLE_PREFIX ||= ENV['TABLE_PREFIX'] || "phpbb_"
+  CHARSET_MAP = {
+    "armscii8" => nil,
+    "ascii"    => Encoding::US_ASCII,
+    "big5"     => Encoding::Big5,
+    "binary"   => Encoding::ASCII_8BIT,
+    "cp1250"   => Encoding::Windows_1250,
+    "cp1251"   => Encoding::Windows_1251,
+    "cp1256"   => Encoding::Windows_1256,
+    "cp1257"   => Encoding::Windows_1257,
+    "cp850"    => Encoding::CP850,
+    "cp852"    => Encoding::CP852,
+    "cp866"    => Encoding::IBM866,
+    "cp932"    => Encoding::Windows_31J,
+    "dec8"     => nil,
+    "eucjpms"  => Encoding::EucJP_ms,
+    "euckr"    => Encoding::EUC_KR,
+    "gb2312"   => Encoding::EUC_CN,
+    "gbk"      => Encoding::GBK,
+    "geostd8"  => nil,
+    "greek"    => Encoding::ISO_8859_7,
+    "hebrew"   => Encoding::ISO_8859_8,
+    "hp8"      => nil,
+    "keybcs2"  => nil,
+    "koi8r"    => Encoding::KOI8_R,
+    "koi8u"    => Encoding::KOI8_U,
+    "latin1"   => Encoding::ISO_8859_1,
+    "latin2"   => Encoding::ISO_8859_2,
+    "latin5"   => Encoding::ISO_8859_9,
+    "latin7"   => Encoding::ISO_8859_13,
+    "macce"    => Encoding::MacCentEuro,
+    "macroman" => Encoding::MacRoman,
+    "sjis"     => Encoding::SHIFT_JIS,
+    "swe7"     => nil,
+    "tis620"   => Encoding::TIS_620,
+    "ucs2"     => Encoding::UTF_16BE,
+    "ujis"     => Encoding::EucJP_ms,
+    "utf8"     => Encoding::UTF_8,
+  }
+
+  def initialize
+    super
+
+    charset = ENV["DB_CHARSET"] || "utf8"
+    database = ENV["DB_NAME"] || "flightaware"
+    password = ENV["DB_PASSWORD"] || "discourse"
+
+    @html_entities = HTMLEntities.new
+    @encoding = CHARSET_MAP[charset]
+
+    @client = PG.connect(dbname: database, password: password)
+
+    @smiley_map = {}
+    add_default_smilies
+  end
+
+  def execute
+    import_groups
+    import_users
+    import_group_users
+
+    import_user_emails
+    import_user_profiles
+
+    import_categories
+    import_topics
+    import_posts
+
+    import_private_topics
+    import_topic_allowed_users
+    import_private_posts
+  end
+
+  def import_groups
+    puts "Importing groups..."
+
+    groups = psql_query <<-SQL
+      SELECT group_id, group_name, group_desc
+      FROM #{TABLE_PREFIX}groups
+      WHERE group_id > #{@last_imported_group_id}
+      ORDER BY group_id
+    SQL
+
+    create_groups(groups) do |row|
+      {
+        imported_id: row["group_id"],
+        name: normalize_text(row["group_name"]),
+        bio_raw: normalize_text(row["group_desc"])
+      }
+    end
+  end
+
+  def import_users
+    puts "Importing users..."
+
+    users = psql_query <<-SQL
+      SELECT u.user_id, u.username, u.user_email, u.user_regdate, u.user_lastvisit, u.user_ip,
+        u.user_type, u.user_inactive_reason, g.group_id, g.group_name, b.ban_start, b.ban_end, b.ban_reason,
+        u.user_posts, u.user_website, u.user_from, u.user_birthday, u.user_avatar_type, u.user_avatar
+      FROM #{TABLE_PREFIX}users u
+      LEFT OUTER JOIN #{TABLE_PREFIX}groups g ON (g.group_id = u.group_id)
+      LEFT OUTER JOIN #{TABLE_PREFIX}banlist b ON (
+        u.user_id = b.ban_userid AND b.ban_exclude = 0 AND
+        b.ban_end = 0
+      )
+      WHERE u.user_id > #{@last_imported_user_id}
+      ORDER BY u.user_id
+    SQL
+
+    create_users(users) do |row|
+      # Without a type map the pg gem returns every column as a String, so
+      # numeric columns must be converted before they are compared with 0.
+      registered_at = Time.zone.at(row["user_regdate"].to_i)
+      last_visit = row["user_lastvisit"].to_i
+
+      u = {
+        imported_id: row["user_id"],
+        username: normalize_text(row["username"]),
+        created_at: registered_at,
+        last_seen_at: last_visit.zero? ? registered_at : Time.zone.at(last_visit),
+        trust_level: row["user_posts"].to_i.zero? ? TrustLevel[0] : TrustLevel[1],
+        date_of_birth: parse_birthday(row["user_birthday"]),
+        primary_group_id: group_id_from_imported_id(row["group_id"])
+      }
+      u[:ip_address] = row["user_ip"][/\b(?:\d{1,3}\.){3}\d{1,3}\b/] if row["user_ip"].present?
+      if row["ban_start"]
+        u[:suspended_at] = Time.zone.at(row["ban_start"].to_i)
+        u[:suspended_till] = row["ban_end"].to_i > 0 ? Time.zone.at(row["ban_end"].to_i) : SUSPENDED_TILL
+      end
+      u
+    end
+  end
+
+  def import_user_emails
+    puts "Importing user emails..."
+
+    users = psql_query <<-SQL
+      SELECT user_id, user_email, user_regdate
+      FROM #{TABLE_PREFIX}users u
+      WHERE user_id > #{@last_imported_user_id}
+      ORDER BY user_id
+    SQL
+
+    create_user_emails(users) do |row|
+      {
+        imported_id: row["user_id"],
+        imported_user_id: row["user_id"],
+        email: row["user_email"],
+        created_at: Time.zone.at(row["user_regdate"].to_i)
+      }
+    end
+  end
+
+  def import_group_users
+    puts "Importing group users..."
+
+    group_users = psql_query <<-SQL
+      SELECT user_id, group_id
+      FROM #{TABLE_PREFIX}users u
+      WHERE user_id > #{@last_imported_user_id}
+    SQL
+
+    create_group_users(group_users) do |row|
+      {
+        group_id: group_id_from_imported_id(row["group_id"]),
+        user_id: user_id_from_imported_id(row["user_id"]),
+      }
+    end
+  end
+
+  def import_user_profiles
+    puts "Importing user profiles..."
+
+    user_profiles = psql_query <<-SQL
+      SELECT user_id, user_website, user_from
+      FROM #{TABLE_PREFIX}users
+      WHERE user_id > #{@last_imported_user_id}
+      ORDER BY user_id
+    SQL
+
+    create_user_profiles(user_profiles) do |row|
+      {
+        user_id: user_id_from_imported_id(row["user_id"]),
+        website: (URI.parse(row["user_website"]).to_s rescue nil),
+        location: row["user_from"],
+      }
+    end
+  end
+
+  def import_categories
+    puts "Importing categories..."
+
+    categories = psql_query(<<-SQL
+      SELECT forum_id, parent_id, forum_name, forum_desc
+      FROM #{TABLE_PREFIX}forums
+      WHERE forum_id > #{@last_imported_category_id}
+      ORDER BY parent_id, left_id
+    SQL
+    ).to_a
+
+    return if categories.empty?
+
+    parent_categories = categories.select { |c| c["parent_id"].to_i == 0 }
+    children_categories = categories.select { |c| c["parent_id"].to_i != 0 }
+
+    puts "Importing parent categories..."
+    create_categories(parent_categories) do |row|
+      {
+        imported_id: row["forum_id"],
+        name: normalize_text(row["forum_name"]),
+        description: normalize_text(row["forum_desc"])
+      }
+    end
+
+    puts "Importing children categories..."
+    create_categories(children_categories) do |row|
+      {
+        imported_id: row["forum_id"],
+        name: normalize_text(row["forum_name"]),
+        description: normalize_text(row["forum_desc"]),
+        parent_category_id: category_id_from_imported_id(row["parent_id"])
+      }
+    end
+  end
+
+  def import_topics
+    puts "Importing topics..."
+
+    topics = psql_query <<-SQL
+      SELECT topic_id, topic_title, forum_id, topic_poster, topic_time, topic_views
+      FROM #{TABLE_PREFIX}topics
+      WHERE topic_id > #{@last_imported_topic_id}
+      AND EXISTS (SELECT 1 FROM #{TABLE_PREFIX}posts WHERE #{TABLE_PREFIX}posts.topic_id = #{TABLE_PREFIX}topics.topic_id)
+      ORDER BY topic_id
+    SQL
+
+    create_topics(topics) do |row|
+      {
+        imported_id: row["topic_id"],
+        title: normalize_text(row["topic_title"]),
+        category_id: category_id_from_imported_id(row["forum_id"]),
+        user_id: user_id_from_imported_id(row["topic_poster"]),
+        created_at: Time.zone.at(row["topic_time"].to_i),
+        views: row["topic_views"]
+      }
+    end
+  end
+
+  def import_posts
+    puts "Importing posts..."
+
+    posts = psql_query <<-SQL
+      SELECT p.post_id, p.topic_id, p.poster_id, p.post_time, p.post_text
+      FROM #{TABLE_PREFIX}posts p
+      JOIN #{TABLE_PREFIX}topics t ON t.topic_id = p.topic_id
+      WHERE p.post_id > #{@last_imported_post_id}
+      ORDER BY p.post_id
+    SQL
+
+    create_posts(posts) do |row|
+      {
+        imported_id: row["post_id"],
+        topic_id: topic_id_from_imported_id(row["topic_id"]),
+        user_id: user_id_from_imported_id(row["poster_id"]),
+        created_at: Time.zone.at(row["post_time"].to_i),
+        raw: process_raw_text(row["post_text"]),
+      }
+    end
+  end
+
+  def import_private_topics
+    puts "Importing private topics..."
+
+    @imported_topics = {}
+
+    topics = psql_query <<-SQL
+      SELECT msg_id, message_subject, author_id, to_address, message_time
+      FROM #{TABLE_PREFIX}privmsgs
+      WHERE msg_id > (#{@last_imported_private_topic_id - PRIVATE_OFFSET})
+      ORDER BY msg_id
+    SQL
+
+    create_topics(topics) do |row|
+      user_ids = get_message_recipients(row["author_id"], row["to_address"])
+      title = extract_pm_title(row["message_subject"])
+      key = [title, user_ids]
+
+      next if @imported_topics.has_key?(key) || title.blank?
+      @imported_topics[key] = row["msg_id"].to_i + PRIVATE_OFFSET
+
+      {
+        archetype: Archetype.private_message,
+        imported_id: row["msg_id"].to_i + PRIVATE_OFFSET,
+        title: normalize_text(title),
+        user_id: user_id_from_imported_id(row["author_id"].to_i),
+        created_at: Time.zone.at(row["message_time"].to_i)
+      }
+    end
+  end
+
+  def import_topic_allowed_users
+    puts "Importing topic allowed users..."
+
+    allowed_users = []
+
+    psql_query(<<-SQL
+      SELECT msg_id, author_id, to_address
+      FROM #{TABLE_PREFIX}privmsgs
+      WHERE msg_id > (#{@last_imported_private_topic_id - PRIVATE_OFFSET})
+      ORDER BY msg_id
+    SQL
+    ).each do |row|
+      next unless topic_id = topic_id_from_imported_id(row["msg_id"].to_i + PRIVATE_OFFSET)
+
+      user_ids = get_message_recipients(row["author_id"], row["to_address"])
+      user_ids.each do |id|
+        next unless user_id = user_id_from_imported_id(id.to_i)
+        allowed_users << [topic_id, user_id]
+      end
+    end
+
+    create_topic_allowed_users(allowed_users) do |row|
+      {
+        topic_id: row[0],
+        user_id: row[1]
+      }
+    end
+  end
+
+  def import_private_posts
+    puts "Importing private posts..."
+
+    posts = psql_query <<-SQL
+      SELECT msg_id, message_subject, author_id, to_address, message_time, message_text
+      FROM #{TABLE_PREFIX}privmsgs
+      WHERE msg_id > (#{@last_imported_private_topic_id - PRIVATE_OFFSET})
+      ORDER BY msg_id
+    SQL
+
+    create_posts(posts) do |row|
+      user_ids = get_message_recipients(row["author_id"], row["to_address"])
+      title = extract_pm_title(row["message_subject"])
+      key = [title, user_ids]
+
+      next unless topic_id = topic_id_from_imported_id(@imported_topics[key])
+      {
+        imported_id: row["msg_id"].to_i + PRIVATE_OFFSET,
+        topic_id: topic_id,
+        user_id: user_id_from_imported_id(row["author_id"].to_i),
+        created_at: Time.zone.at(row["message_time"].to_i),
+        raw: process_raw_text(row["message_text"])
+      }
+    end
+  end
+
+  # Returns the sorted, de-duplicated phpBB user ids taking part in a private
+  # message: the author plus every "u_<id>" entry of the colon separated
+  # to_address. Group recipients ("g_<id>") are skipped because their ids are
+  # group ids, not user ids.
+  def get_message_recipients(from, to)
+    user_ids = to.to_s.split(':').select { |r| r.start_with?("u_") }.map { |r| r[2..-1].to_i }
+    user_ids << from.to_i
+    user_ids.uniq.sort
+  end
+
+  # Decodes HTML entities in a private message subject and strips a leading
+  # "Re:" so that replies share the title of the message they answer.
+  def extract_pm_title(title)
+    return nil if title.nil?
+    CGI.unescapeHTML(title).sub(/\ARe\s*:\s*/i, "")
+  end
+
+  def normalize_text(text)
+    return nil unless text.present?
+    @html_entities.decode(normalize_charset(text.presence || "").scrub)
+  end
+
+  def normalize_charset(text)
+    return text if @encoding == Encoding::UTF_8
+    return text && text.encode(@encoding).force_encoding(Encoding::UTF_8)
+  end
+
+  # Parses phpBB's "DD-MM-YYYY" birthday string (day first, space padded).
+  # Returns nil for blank or invalid dates; years before 1904 are clamped.
+  def parse_birthday(birthday)
+    return if birthday.blank?
+    date_of_birth = Date.strptime(birthday.gsub(/[^\d-]+/, ""), "%d-%m-%Y") rescue nil
+    return if date_of_birth.nil?
+    date_of_birth.year < 1904 ? Date.new(1904, date_of_birth.month, date_of_birth.day) : date_of_birth
+  end
+
+  def psql_query(sql)
+    @client.query(sql)
+  end
+
+  # Converts a raw phpBB post or message body to Markdown: strips BBCode
+  # uids, runs bbcode-to-md and rewrites phpBB's magic links and smilies.
+  def process_raw_text(raw)
+    return "" if raw.blank?
+    text = raw.dup
+    text = CGI.unescapeHTML(text)
+
+    text.gsub!(/:(?:\w{8})\]/, ']')
+
+    text = bbcode_to_md(text)
+
+    # Some links look like this: <!-- m --><a class="postlink" href="http://www.onegameamonth.com">http://www.onegameamonth.com</a><!-- m -->
+    text.gsub!(/<!-- \w --><a(?:.+)href="(\S+)"(?:.*)>(.+)<\/a><!-- \w -->/i, '[\2](\1)')
+
+    # phpBB shortens link text like this, which breaks our markdown processing:
+    #   [http://answers.yahoo.com/question/index ... 223AAkkPli](http://answers.yahoo.com/question/index?qid=20070920134223AAkkPli)
+    #
+    # Work around it for now:
+    text.gsub!(/\[http(s)?:\/\/(www\.)?/i, '[')
+
+    # convert list tags to ul and list=1 tags to ol
+    # list=a is not supported, so handle it like list=1
+    # list=9 and list=x have the same result as list=1 and list=a
+    text.gsub!(/\[list\](.*?)\[\/list:u\]/mi, '[ul]\1[/ul]')
+    text.gsub!(/\[list=.*?\](.*?)\[\/list:o\]/mi, '[ol]\1[/ol]')
+
+    # convert *-tags to li-tags so bbcode-to-md can do its magic on phpBB's lists:
+    text.gsub!(/\[\*\](.*?)\[\/\*:m\]/mi, '[li]\1[/li]')
+
+    # [QUOTE="<username>"] -- add newline
+    text.gsub!(/(\[quote="[a-zA-Z\d]+"\])/i) { "#{$1}\n" }
+
+    # [/QUOTE] -- add newline
+    text.gsub!(/(\[\/quote\])/i) { "\n#{$1}" }
+
+    # :) is encoded as <!-- s:) --><img src="{SMILIES_PATH}/icon_e_smile.gif" alt=":)" title="Smile" /><!-- s:) -->
+    text.gsub!(/<!-- s(\S+) --><img src="\{SMILIES_PATH\}\/(.+?)" alt="(.*?)" title="(.*?)" \/><!-- s(?:\S+) -->/) do
+      smiley = $1
+      @smiley_map.fetch(smiley) do
+        # upload_smiley(smiley, $2, $3, $4) || smiley_as_text(smiley)
+        @smiley_map[smiley] = smiley
+      end
+    end
+
+    text
+  end
+
+  protected
+
+  # Falls back to the unconverted text when ruby-bbcode-to-md raises.
+  def bbcode_to_md(text)
+    text.bbcode_to_md(false)
+  rescue StandardError
+    puts "Problem converting \n#{text}\n using ruby-bbcode-to-md"
+    text
+  end
+
+  def add_default_smilies
+    {
+      [':D', ':-D', ':grin:'] => ':smiley:',
+      [':)', ':-)', ':smile:'] => ':slight_smile:',
+      [';)', ';-)', ':wink:'] => ':wink:',
+      [':(', ':-(', ':sad:'] => ':frowning:',
+      [':o', ':-o', ':eek:'] => ':astonished:',
+      [':shock:'] => ':open_mouth:',
+      [':?', ':-?', ':???:'] => ':confused:',
+      ['8-)', ':cool:'] => ':sunglasses:',
+      [':lol:'] => ':laughing:',
+      [':x', ':-x', ':mad:'] => ':angry:',
+      [':P', ':-P', ':razz:'] => ':stuck_out_tongue:',
+      [':oops:'] => ':blush:',
+      [':cry:'] => ':cry:',
+      [':evil:'] => ':imp:',
+      [':twisted:'] => ':smiling_imp:',
+      [':roll:'] => ':unamused:',
+      [':!:'] => ':exclamation:',
+      [':?:'] => ':question:',
+      [':idea:'] => ':bulb:',
+      [':arrow:'] => ':arrow_right:',
+      [':|', ':-|'] => ':neutral_face:',
+      [':geek:'] => ':nerd:'
+    }.each do |smilies, emoji|
+      smilies.each { |smiley| @smiley_map[smiley] = emoji }
+    end
+  end
+
+end
+
+BulkImport::PhpBB.new.run