# script/import_scripts/question2answer.rb
#
# Imports users, categories, questions, answers and votes from a
# Question2Answer (Q2A) MySQL database into Discourse.
#
# NOTE(review): this file was recovered from a whitespace-collapsed diff in
# which some text had been lost. Every reconstructed span is marked with its
# own NOTE(review) below and must be confirmed against the original commit.

require 'mysql2'
require File.expand_path(File.dirname(__FILE__) + "/base.rb")
require 'htmlentities'

begin
  require 'php_serialize' # https://github.com/jqr/php-serialize
rescue LoadError
  puts
  puts 'php_serialize not found.'
  puts 'Add to Gemfile, like this: '
  puts
  puts "echo gem \\'php-serialize\\' >> Gemfile"
  puts "bundle install"
  exit
end

class ImportScripts::Question2Answer < ImportScripts::Base
  BATCH_SIZE = 1000

  # CHANGE THESE BEFORE RUNNING THE IMPORTER

  DB_HOST ||= ENV['DB_HOST'] || "localhost"
  DB_NAME ||= ENV['DB_NAME']
  DB_PW ||= ENV['DB_PW']
  DB_USER ||= ENV['DB_USER']
  TIMEZONE ||= ENV['TIMEZONE'] || "America/Los_Angeles"
  TABLE_PREFIX ||= ENV['TABLE_PREFIX'] || "qa_"
  MAIN_APP_DB_NAME = "primary_db"

  # NOTE(review): this prints the database password to stdout every time the
  # class is loaded; consider masking it.
  puts "#{DB_USER}:#{DB_PW}@#{DB_HOST} wants #{DB_NAME}"

  # Sets up the username map, timezone, entity decoder and MySQL connection.
  # Any failure during setup is reported and the process exits.
  def initialize
    super

    @old_username_to_new_usernames = {}

    @tz = TZInfo::Timezone.get(TIMEZONE)

    @htmlentities = HTMLEntities.new

    @client = Mysql2::Client.new(
      host: DB_HOST,
      username: DB_USER,
      password: DB_PW,
      database: DB_NAME
    )
  rescue StandardError => e
    # StandardError rather than Exception, so that SystemExit and Interrupt
    # are not swallowed by this handler.
    puts '=' * 50
    puts e.message
    # NOTE(review): the original heredoc text was lost in extraction; this
    # message is a reconstruction.
    puts <<~EOM
      Cannot connect to the database.

      Hostname: #{DB_HOST}
      Username: #{DB_USER}
      Database: #{DB_NAME}

      Edit the script or set the DB_* environment variables, then run it again.

      Cannot continue
    EOM
    exit
  end

  # Runs every import step in dependency order.
  #
  # NOTE(review): the original body of this method was lost in extraction.
  # The list below is rebuilt from the steps defined in this class, ordered so
  # that each step's lookups (users -> categories -> topics -> posts -> likes)
  # are already populated. Confirm against the original commit.
  def execute
    import_users
    import_categories
    import_topics
    import_posts
    import_likes

    post_process_posts
    create_permalinks
  end

  # Imports, in id-ordered batches, every user that has at least one post or
  # one vote.
  def import_users
    puts "", "importing users"

    # The vote subquery uses its own alias (uv). The original reused "u",
    # which shadowed the outer user row inside the subquery.
    activity_filter =
      "(EXISTS (SELECT 1 FROM #{TABLE_PREFIX}posts p WHERE p.userid=u.id) " \
      "or EXISTS (SELECT 1 FROM #{TABLE_PREFIX}uservotes uv WHERE uv.userid=u.id))"

    # NOTE(review): the count query, the SELECT list and the FROM clause were
    # lost in extraction. The columns are the ones the block below reads; the
    # table name is a guess based on the otherwise unused MAIN_APP_DB_NAME
    # constant. Confirm against the original commit.
    user_count = mysql_query(
      "SELECT COUNT(u.id) count FROM #{MAIN_APP_DB_NAME}.users u WHERE #{activity_filter}"
    ).first["count"]

    last_user_id = -1

    batches(BATCH_SIZE) do |offset|
      users = mysql_query(<<-SQL
          SELECT u.id, u.email, u.username, u.first_name, u.last_name, u.website,
                 u.cdn_slug, u.custom_field_1, u.city, u.state,
                 u.created_at, u.last_sign_in_at
            FROM #{MAIN_APP_DB_NAME}.users u
           WHERE u.id > #{last_user_id} AND
                 #{activity_filter}
        ORDER BY u.id
           LIMIT #{BATCH_SIZE}
      SQL
      ).to_a

      break if users.empty?

      # Keyset pagination: remember the last id before filtering the batch.
      last_user_id = users[-1]["id"]
      users.reject! { |u| @lookup.user_already_imported?(u["id"].to_i) }

      create_users(users, total: user_count, offset: offset) do |user|
        # Fall back to a generated address when the source row has no email.
        email = user["email"].presence || fake_email

        # The username is the local part of the source email address.
        username = @htmlentities.decode(user["email"]).strip.split("@").first
        avatar_url = "https://your_image_bucket/#{user['cdn_slug']}" if user['cdn_slug']
        {
          id: user["id"],
          name: "#{user['first_name']} #{user['last_name']}",
          username: username,
          website: user['website'],
          email: email,
          avatar_url: avatar_url,
          custom_fields: user["custom_field_1"] ? { user_field_1: user["custom_field_1"] } : {},
          location: user["city"] && user["state"] ? "#{user['city']}, #{user['state']}" : nil,
          created_at: user["created_at"],
          last_seen_at: user["last_sign_in_at"],
          # Remember the final username so mentions and quotes can be rewritten.
          post_create_action: proc do |u|
            @old_username_to_new_usernames[user["username"]] = u.username
          end
        }
      end
    end
  end

  # Imports categories as a two-level tree: rows without a parent become
  # top-level categories, and every other row is attached to its top-level
  # ancestor.
  def import_categories
    puts "", "importing top level categories..."

    categories = mysql_query("SELECT categoryid, parentid, title, position FROM #{TABLE_PREFIX}categories ORDER BY categoryid").to_a

    top_level_categories, children_categories = categories.partition { |c| c["parentid"].nil? }

    create_categories(top_level_categories) do |category|
      {
        id: category["categoryid"],
        name: @htmlentities.decode(category["title"]).strip,
        position: category["position"]
      }
    end

    puts "", "importing children categories..."

    top_level_category_ids = Set.new(top_level_categories.map { |c| c["categoryid"] })
    parent_ids = categories.each_with_object({}) { |c, index| index[c["categoryid"]] = c["parentid"] }

    # cut down the tree to only 2 levels of categories
    children_categories.each do |cc|
      cc["parentid"] = top_level_ancestor_id(cc["parentid"], parent_ids, top_level_category_ids)
    end

    create_categories(children_categories) do |category|
      {
        id: category["categoryid"],
        name: @htmlentities.decode(category["title"]).strip,
        position: category["position"],
        parent_category_id: category_id_from_imported_category_id(category["parentid"])
      }
    end
  end

  # Imports questions (post types 'Q' and 'Q_HIDDEN' without a parent) as
  # topics under the import id "thread-<postid>", then creates a
  # "<postid>/<slug>" permalink for each topic imported in the batch.
  def import_topics
    puts "", "importing topics..."

    topic_count = mysql_query("SELECT COUNT(postid) count FROM #{TABLE_PREFIX}posts WHERE type in ('Q', 'Q_HIDDEN')").first["count"]

    last_topic_id = -1

    batches(BATCH_SIZE) do |offset|
      topics = mysql_query(<<-SQL
          SELECT p.postid, p.type, p.categoryid, p.closedbyid, p.userid postuserid, p.views, p.created, p.title, p.content raw
            FROM #{TABLE_PREFIX}posts p
           WHERE p.postid > #{last_topic_id}
             and p.parentid IS NULL
             and type IN ('Q', 'Q_HIDDEN')
        ORDER BY p.postid
           LIMIT #{BATCH_SIZE}
      SQL
      ).to_a

      break if topics.empty?

      last_topic_id = topics[-1]["postid"]
      topics.reject! { |t| @lookup.post_already_imported?("thread-#{t["postid"]}") }

      create_posts(topics, total: topic_count, offset: offset) do |topic|
        begin
          raw = preprocess_post_raw(topic["raw"])
        rescue => e
          # Best effort: report the failure; raw stays nil for this topic.
          puts e.message
        end

        {
          id: "thread-#{topic["postid"]}",
          user_id: user_id_from_imported_user_id(topic["postuserid"]) || Discourse::SYSTEM_USER_ID,
          title: @htmlentities.decode(topic["title"]).strip[0...255],
          category: category_id_from_imported_category_id(topic["categoryid"]),
          raw: raw,
          created_at: topic["created"],
          visible: topic["closedbyid"].to_i == 0 && topic["type"] != 'Q_HIDDEN',
          views: topic["views"],
        }
      end

      # Create a permalink for every titled topic imported in this batch
      # (remove this loop if permalinks are not wanted).
      topics.each do |thread|
        next unless thread["title"].present?

        topic = topic_lookup_from_imported_post_id("thread-#{thread["postid"]}")
        next unless topic.present? && topic[:topic_id].present?

        url_slug = "#{thread["postid"]}/#{slugify(thread["title"], false, 50)}"
        Permalink.create(url: url_slug, topic_id: topic[:topic_id].to_i)
      end
    end
  end

  # Builds the slug used in topic permalinks.
  #
  # Non-alphanumeric characters are removed and the words are joined with "-".
  # When the combined word length exceeds max_length, words are visited from
  # longest to shortest, each one using up budget; once the budget is
  # exhausted the remaining (shorter) words are dropped. The word that crosses
  # the limit is kept, so the result can be longer than max_length.
  #
  # NOTE(review): presumably this mirrors the slug algorithm of the source
  # site so that permalinks match the old URLs; do not "fix" the overshoot
  # without confirming. ascii_only is accepted but not used.
  def slugify(title, ascii_only, max_length)
    words = title.downcase.gsub(/[^a-zA-Z0-9\s]/, '').split(" ")
    word_lengths = {}
    words.each_with_index { |word, idx| word_lengths[idx] = word.length }

    remaining = max_length
    if word_lengths.values.sum > remaining
      word_lengths = Hash[word_lengths.sort { |x, y| y[1] <=> x[1] }]
      word_lengths.each do |idx, word_length|
        if remaining > 0
          remaining -= word_length
        else
          words[idx] = nil
        end
      end
    end

    words.compact.join("-")
  end

  # Imports answers (post type 'A' with closedbyid NULL) as replies in the
  # topic created for their parent question.
  def import_posts
    puts "", "importing posts..."

    # Same filter as the batch query below, so the progress total matches the
    # rows that are actually imported.
    post_count = mysql_query(<<-SQL
        SELECT COUNT(postid) count
          FROM #{TABLE_PREFIX}posts p
         WHERE p.parentid IS NOT NULL
           AND type in ('A')
           AND closedbyid IS NULL
    SQL
    ).first["count"]

    last_post_id = -1

    batches(BATCH_SIZE) do |offset|
      posts = mysql_query(<<-SQL
          SELECT p.postid, p.type, p.parentid, p.categoryid, p.closedbyid, p.userid, p.views, p.created, p.title, p.content
            FROM #{TABLE_PREFIX}posts p
           WHERE p.parentid IS NOT NULL
             AND p.postid > #{last_post_id}
             AND type in ('A')
             AND closedbyid IS NULL
        ORDER BY p.postid
           LIMIT #{BATCH_SIZE}
      SQL
      ).to_a

      break if posts.empty?

      last_post_id = posts[-1]["postid"]
      posts.reject! { |p| @lookup.post_already_imported?(p["postid"].to_i) }

      create_posts(posts, total: post_count, offset: offset) do |post|
        begin
          raw = preprocess_post_raw(post["content"])
        rescue => e
          puts e.message
        end
        next if raw.blank?
        next unless topic = topic_lookup_from_imported_post_id("thread-#{post["parentid"]}")

        mapped = {
          id: post["postid"],
          user_id: user_id_from_imported_user_id(post["userid"]) || Discourse::SYSTEM_USER_ID,
          topic_id: topic[:topic_id],
          raw: raw,
          created_at: post["created"],
        }
        # NOTE(review): questions are imported under "thread-<id>", so this
        # lookup by the bare parent id only matches a parent that was itself
        # imported by this method. Confirm whether it is still needed.
        if parent = topic_lookup_from_imported_post_id(post["parentid"])
          mapped[:reply_to_post_number] = parent[:post_number]
        end
        mapped
      end
    end
  end

  # Imports every vote with vote=1 as a post action on the imported post.
  def import_likes
    puts "", "importing likes..."

    likes = mysql_query(<<-SQL
        SELECT postid, userid
          FROM #{TABLE_PREFIX}uservotes u
         WHERE u.vote=1
    SQL
    ).to_a

    likes.each do |like|
      # Questions are imported as "thread-<postid>" and answers under the bare
      # postid, so try both to cover votes on either kind of post.
      post_id = post_id_from_imported_post_id("thread-#{like['postid']}") ||
        post_id_from_imported_post_id(like["postid"])
      post = Post.find_by(id: post_id)
      user = User.find_by(id: user_id_from_imported_user_id(like["userid"]))
      next unless user && post

      begin
        # 2 is presumably the "like" post action type id (TODO confirm).
        PostAction.act(user, post, 2)
      rescue => e
        puts "Unable to import like by user #{like['userid']} on post #{like['postid']}: #{e.message}"
      end
    end
  end

  # Runs postprocess_post_raw over every post and saves the ones it changed.
  def post_process_posts
    puts "", "Postprocessing posts..."

    current = 0
    max = Post.count

    Post.find_each do |post|
      begin
        new_raw = postprocess_post_raw(post.raw)
        if new_raw != post.raw
          post.raw = new_raw
          post.save
        end
      rescue PrettyText::JavaScriptError
        nil
      ensure
        print_status(current += 1, max)
      end
    end
  end

  # Converts source post content into the raw text stored in Discourse:
  # decodes entities, strips HTML, and rewrites or removes BBCode tags.
  # Returns "" for blank input.
  def preprocess_post_raw(raw)
    return "" if raw.blank?

    # decode HTML entities
    raw = @htmlentities.decode(raw)
    raw = ActionView::Base.full_sanitizer.sanitize(raw)

    # fix whitespaces
    raw.gsub!(/(\\r)?\\n/, "\n")
    raw.gsub!("\\t", "\t")

    # NOTE(review): the literal replaced here was rendered as a line break in
    # the recovered text; reconstructed as a <br> tag pattern. It runs after
    # the sanitizer, which presumably has already removed tags. Confirm.
    raw.gsub!(/<br\s*\/?>/i, "\n")

    # [HTML]...[/HTML]
    raw.gsub!(/\[html\]/i, "\n```html\n")
    raw.gsub!(/\[\/html\]/i, "\n```\n")

    # [PHP]...[/PHP]
    raw.gsub!(/\[php\]/i, "\n```php\n")
    raw.gsub!(/\[\/php\]/i, "\n```\n")

    # [HIGHLIGHT="..."]
    raw.gsub!(/\[highlight="?(\w+)"?\]/i) { "\n```#{$1.downcase}\n" }

    # [CODE]...[/CODE]
    # [HIGHLIGHT]...[/HIGHLIGHT]
    raw.gsub!(/\[\/?code\]/i, "\n```\n")
    raw.gsub!(/\[\/?highlight\]/i, "\n```\n")

    # [SAMP]...[/SAMP]
    raw.gsub!(/\[\/?samp\]/i, "`")

    # replace all chevrons with HTML entities
    # NOTE: must be done
    #  - AFTER all the "code" processing
    #  - BEFORE the "quote" processing
    # Chevrons inside backticks are parked as U+2603 so they survive.
    # NOTE(review): the entity targets were decoded to bare chevrons in the
    # recovered text; restored from the comment above. Confirm.
    raw.gsub!(/`([^`]+)`/im) { "`" + $1.gsub("<", "\u2603") + "`" }
    raw.gsub!("<", "&lt;")
    raw.gsub!("\u2603", "<")

    raw.gsub!(/`([^`]+)`/im) { "`" + $1.gsub(">", "\u2603") + "`" }
    raw.gsub!(">", "&gt;")
    raw.gsub!("\u2603", ">")

    # [URL=...]...[/URL]
    raw.gsub!(/\[url="?([^"]+?)"?\](.*?)\[\/url\]/im) { "[#{$2.strip}](#{$1})" }
    raw.gsub!(/\[url="?(.+?)"?\](.+)\[\/url\]/im) { "[#{$2.strip}](#{$1})" }

    # [URL]...[/URL]
    # [MP3]...[/MP3]
    raw.gsub!(/\[\/?url\]/i, "")
    raw.gsub!(/\[\/?mp3\]/i, "")

    # [MENTION]<username>[/MENTION]
    raw.gsub!(/\[mention\](.+?)\[\/mention\]/i) do
      old_username = $1
      if @old_username_to_new_usernames.has_key?(old_username)
        old_username = @old_username_to_new_usernames[old_username]
      end
      "@#{old_username}"
    end

    # [FONT=blah] and [COLOR=blah]
    raw.gsub!(/\[FONT=.*?\](.*?)\[\/FONT\]/im, '\1')
    raw.gsub!(/\[COLOR=.*?\](.*?)\[\/COLOR\]/im, '\1')
    raw.gsub!(/\[COLOR=#.*?\](.*?)\[\/COLOR\]/im, '\1')

    raw.gsub!(/\[SIZE=.*?\](.*?)\[\/SIZE\]/im, '\1')
    raw.gsub!(/\[h=.*?\](.*?)\[\/h\]/im, '\1')

    # [CENTER]...[/CENTER]
    raw.gsub!(/\[CENTER\](.*?)\[\/CENTER\]/im, '\1')

    # [INDENT]...[/INDENT]
    raw.gsub!(/\[INDENT\](.*?)\[\/INDENT\]/im, '\1')
    raw.gsub!(/\[TABLE\](.*?)\[\/TABLE\]/im, '\1')
    raw.gsub!(/\[TR\](.*?)\[\/TR\]/im, '\1')
    raw.gsub!(/\[TD\](.*?)\[\/TD\]/im, '\1')
    raw.gsub!(/\[TD="?.*?"?\](.*?)\[\/TD\]/im, '\1')

    # [QUOTE]...[/QUOTE]
    # Non-bang gsub on purpose: gsub! returns nil when nothing matches, which
    # would make this block return nil and erase the whole quote.
    raw.gsub!(/\[quote\](.+?)\[\/quote\]/im) do |quote|
      quote
        .gsub(/\[quote\](.+?)\[\/quote\]/im) { "\n#{$1}\n" }
        .gsub(/\n(.+?)/) { "\n> #{$1}" }
    end

    # [QUOTE=<username>]...[/QUOTE]
    raw.gsub!(/\[quote=([^;\]]+)\](.+?)\[\/quote\]/im) do
      old_username, quote = $1, $2
      if @old_username_to_new_usernames.has_key?(old_username)
        old_username = @old_username_to_new_usernames[old_username]
      end
      "\n[quote=\"#{old_username}\"]\n#{quote}\n[/quote]\n"
    end

    # [YOUTUBE]<id>[/YOUTUBE]
    raw.gsub!(/\[youtube\](.+?)\[\/youtube\]/i) { "\n//youtu.be/#{$1}\n" }

    # [VIDEO=youtube;<id>]...[/VIDEO]
    raw.gsub!(/\[video=youtube;([^\]]+)\].*?\[\/video\]/i) { "\n//youtu.be/#{$1}\n" }

    # More Additions ....

    # [spoiler=Some hidden stuff]SPOILER HERE!![/spoiler]
    raw.gsub!(/\[spoiler="?(.+?)"?\](.+?)\[\/spoiler\]/im) { "\n#{$1}\n[spoiler]#{$2}[/spoiler]\n" }

    # [IMG][IMG]http://i63.tinypic.com/akga3r.jpg[/IMG][/IMG]
    raw.gsub!(/\[IMG\]\[IMG\](.+?)\[\/IMG\]\[\/IMG\]/i) { "[IMG]#{$1}[/IMG]" }

    # convert list tags to ul and list=1 tags to ol
    # (basically, we're only missing list=a here...)
    # (https://meta.discourse.org/t/phpbb-3-importer-old/17397)
    raw.gsub!(/\[list\](.*?)\[\/list\]/im, '[ul]\1[/ul]')
    raw.gsub!(/\[list=1\](.*?)\[\/list\]/im, '[ol]\1[/ol]')
    raw.gsub!(/\[list\](.*?)\[\/list:u\]/im, '[ul]\1[/ul]')
    raw.gsub!(/\[list=1\](.*?)\[\/list:o\]/im, '[ol]\1[/ol]')
    # convert *-tags to li-tags so bbcode-to-md can do its magic on phpBB's lists:
    raw.gsub!(/\[\*\]\n/, '')
    raw.gsub!(/\[\*\](.*?)\[\/\*:m\]/, '[li]\1[/li]')
    raw.gsub!(/\[\*\](.*?)\n/, '[li]\1[/li]')
    raw.gsub!(/\[\*=1\]/, '')

    raw.strip!
    raw
  end

  # Rewrites references that can only be resolved once every post has been
  # imported: quotes that name a post, [THREAD] and [POST] links, and
  # attachment tags. Returns a new string; the argument is left untouched.
  def postprocess_post_raw(raw)
    # Work on a copy. The caller passes post.raw and then compares the result
    # with post.raw; mutating the argument in place made that comparison
    # always equal, so changed posts were never saved.
    raw = raw.dup

    # [QUOTE=<username>;<post_id>]...[/QUOTE]
    raw.gsub!(/\[quote=([^;]+);(\d+)\](.+?)\[\/quote\]/im) do
      old_username, post_id, quote = $1, $2, $3

      if @old_username_to_new_usernames.has_key?(old_username)
        old_username = @old_username_to_new_usernames[old_username]
      end

      if topic_lookup = topic_lookup_from_imported_post_id(post_id)
        post_number = topic_lookup[:post_number]
        topic_id = topic_lookup[:topic_id]
        "\n[quote=\"#{old_username},post:#{post_number},topic:#{topic_id}\"]\n#{quote}\n[/quote]\n"
      else
        "\n[quote=\"#{old_username}\"]\n#{quote}\n[/quote]\n"
      end
    end

    # remove attachments
    raw.gsub!(/\[attach[^\]]*\]\d+\[\/attach\]/i, "")

    # [THREAD]<thread_id>[/THREAD]
    # ==> http://my.discourse.org/t/slug/<topic_id>
    raw.gsub!(/\[thread\](\d+)\[\/thread\]/i) do
      thread_id = $1
      if topic_lookup = topic_lookup_from_imported_post_id("thread-#{thread_id}")
        topic_lookup[:url]
      else
        $&
      end
    end

    # [THREAD=<thread_id>]...[/THREAD]
    # ==> [...](http://my.discourse.org/t/slug/<topic_id>)
    raw.gsub!(/\[thread=(\d+)\](.+?)\[\/thread\]/i) do
      thread_id, link = $1, $2
      if topic_lookup = topic_lookup_from_imported_post_id("thread-#{thread_id}")
        url = topic_lookup[:url]
        "[#{link}](#{url})"
      else
        $&
      end
    end

    # [POST]<post_id>[/POST]
    # ==> http://my.discourse.org/t/slug/<topic_id>/<post_number>
    raw.gsub!(/\[post\](\d+)\[\/post\]/i) do
      post_id = $1
      if topic_lookup = topic_lookup_from_imported_post_id(post_id)
        topic_lookup[:url]
      else
        $&
      end
    end

    # [POST=<post_id>]...[/POST]
    # ==> [...](http://my.discourse.org/t/<topic_slug>/<topic_id>/<post_number>)
    raw.gsub!(/\[post=(\d+)\](.+?)\[\/post\]/i) do
      post_id, link = $1, $2
      if topic_lookup = topic_lookup_from_imported_post_id(post_id)
        url = topic_lookup[:url]
        "[#{link}](#{url})"
      else
        $&
      end
    end

    raw
  end

  # Creates a permalink ("<parent-slug>/<slug>" or "<slug>") for every
  # category that has an "import_id" custom field.
  def create_permalinks
    puts '', 'Creating Permalink File...', ''

    # creates permalinks for q2a category links
    Category.find_each do |category|
      ccf = category.custom_fields
      next unless ccf && ccf["import_id"]

      url = category.parent_category ? "#{category.parent_category.slug}/#{category.slug}" : category.slug
      begin
        Permalink.create(url: url, category_id: category.id)
      rescue StandardError => e
        # Still best effort, but the failure is reported instead of hidden.
        puts "Unable to create permalink #{url}: #{e.message}"
      end
    end
  end

  # Converts a timestamp with the configured timezone (@tz) and wraps the
  # result with Time.zone.at.
  def parse_timestamp(timestamp)
    Time.zone.at(@tz.utc_to_local(timestamp))
  end

  # Returns a random placeholder address for users without an email.
  def fake_email
    SecureRandom.hex << "@domain.com"
  end

  # Runs a query on the source database and returns the Mysql2 result.
  def mysql_query(sql)
    @client.query(sql, cache_rows: true)
  end

  private

  # Walks up the parent chain from category_id until it reaches an id that is
  # in top_level_ids, and returns that id. Returns nil when the chain ends in
  # a missing row or loops back on itself, instead of raising or spinning.
  def top_level_ancestor_id(category_id, parent_ids, top_level_ids)
    visited = Set.new

    until top_level_ids.include?(category_id)
      return nil if category_id.nil?
      return nil unless visited.add?(category_id)
      return nil unless parent_ids.key?(category_id)

      category_id = parent_ids[category_id]
    end

    category_id
  end

end

ImportScripts::Question2Answer.new.perform