require 'sqlite3'
require 'tempfile'
require 'zlib'
require File.expand_path(File.dirname(__FILE__) + "/base.rb")

# Paste these lines into your shell before running this:

=begin
export MBOX_SUBDIR="messages" # subdirectory with mbox files
export LIST_NAME=LIST_NAME
export DEFAULT_TRUST_LEVEL=1
export DATA_DIR=~/data/import
export SPLIT_AT="^From " # or "^From (.*)"
=end

# If you change the functionality of this script, please consider updating this HOWTO:
# https://meta.discourse.org/t/howto-import-mbox-mailing-list-files/51233

# Imports mbox mailing-list archives.
#
# The import runs in two phases: the raw messages are first indexed into a
# SQLite database (index.db inside MBOX_DIR), then users, topics and replies
# are created from that index.
class ImportScripts::Mbox < ImportScripts::Base
  include ActiveModel::Validations

  # CHANGE THESE BEFORE RUNNING THE IMPORTER

  MBOX_SUBDIR = ENV['MBOX_SUBDIR'] || "messages" # subdirectory with mbox files
  LIST_NAME = ENV['LIST_NAME'] || "" # Will remove [LIST_NAME] from Subjects
  # ENV values are Strings, so coerce to an Integer trust level.
  DEFAULT_TRUST_LEVEL = (ENV['DEFAULT_TRUST_LEVEL'].presence || 1).to_i
  DATA_DIR = ENV['DATA_DIR'] || "~/data/import"
  MBOX_DIR = File.expand_path(DATA_DIR) # where index.db will be created
  BATCH_SIZE = 1000

  # Maximum number of bound parameters used in a single SQL statement
  # (kept below SQLite's historical default limit of 999 variables).
  SQL_BIND_SLICE = 500

  # Site settings
  SiteSetting.disable_emails = true

  # Set SPLIT_AT to nil if each file contains a single message
  # Use formail to split yourself: http://linuxcommand.org/man_pages/formail1.html
  # SPLIT_AT = /^From (.*) at/ # for Google Groups?
  # A Regexp is always truthy, so the fallback has to be chosen on the ENV value itself.
  SPLIT_AT = ENV['SPLIT_AT'].present? ? Regexp.new(ENV['SPLIT_AT']) : /^From / # for standard MBOX files

  # Will create a category if it doesn't exist
  # create subdirectories in MBOX_SUBDIR with categories
  CATEGORY_MAPPINGS = {
    "default" => "uncategorized",
    # ex: "jobs-folder" => "jobs"
  }

  unless File.directory?(MBOX_DIR)
    puts "Cannot find import directory #{MBOX_DIR}. Giving up."
    exit
  end

  validates_format_of :email, with: /\A([^@\s]+)@((?:[-a-z0-9]+\.)+[a-z]{2,})\Z/i, on: :create

  # Runs every import step in order.
  def execute
    import_categories
    create_email_indices
    create_user_indices
    massage_indices
    import_users
    create_forum_topics
    import_replies
    # replace_email_addresses # uncomment to replace all email address with @username
  end

  # Creates one category per CATEGORY_MAPPINGS value, except 'uncategorized'.
  def import_categories
    mappings = CATEGORY_MAPPINGS.values - ['uncategorized']

    create_categories(mappings) do |c|
      { id: c, name: c }
    end
  end

  # Opens (creating it if needed) the SQLite index database inside MBOX_DIR.
  def open_db
    SQLite3::Database.new("#{MBOX_DIR}/index.db")
  end

  # Yields every line of file +f+; files whose name ends in ".gz" are
  # decompressed on the fly.
  def each_line(f)
    infile = File.open(f, 'r')
    reader = f.ends_with?('.gz') ? Zlib::GzipReader.new(infile) : infile

    reader.each_line { |line| yield line }
  ensure
    # Either handle may be nil if opening failed; closing the gzip reader
    # also closes the underlying file, hence the closed? checks.
    reader.close if reader && !reader.closed?
    infile.close if infile && !infile.closed?
  end

  # Yields (mail, filename) for every message found in MBOX_SUBDIR and in the
  # directories named by the CATEGORY_MAPPINGS keys. When SPLIT_AT is set,
  # each file is split into messages at lines matching it; otherwise each
  # file is read as a single message.
  def all_messages
    files = Dir["#{MBOX_DIR}/#{MBOX_SUBDIR}/*"]

    CATEGORY_MAPPINGS.keys.each do |k|
      files << Dir["#{MBOX_DIR}/#{k}/*"]
    end

    files.flatten!
    files.sort!

    files.each_with_index do |f, idx|
      print_warning "\nProcessing: #{f}"
      start_time = Time.now

      # Parses one raw message, hands it to the caller and reports progress.
      deliver = lambda do |raw_message|
        yield Mail.read_from_string(raw_message), f
        print_status(idx, files.size, start_time)
      end

      if SPLIT_AT.present?
        msg = ""

        each_line(f) do |line|
          line = line.scrub
          # A separator line starts a new message: flush the one collected so far.
          if line =~ SPLIT_AT && !msg.empty?
            deliver.call(msg)
            msg = ""
          end
          msg << line
        end

        deliver.call(msg) unless msg.empty?
      else
        deliver.call(File.read(f))
      end
    end
  end

  # Normalizes the reply_to column of the emails index:
  # * empty strings become NULL,
  # * references to messages that are not in the index are cleared,
  # * messages sharing a title are pointed at the earliest message with
  #   that title.
  def massage_indices
    db = open_db
    db.execute "UPDATE emails SET reply_to = null WHERE reply_to = ''"

    rows = db.execute "SELECT msg_id, title, reply_to FROM emails ORDER BY datetime(email_date) ASC"

    msg_ids = {}
    titles = {}
    rows.each do |row|
      msg_ids[row[0]] = true
      # Rows are ordered by date, so the first message seen wins the title.
      titles[row[1]] = row[0] if titles[row[1]].nil?
    end

    # First, any replies where the parent doesn't exist should have that field cleared
    not_found = rows
      .select { |_, _, reply_to| reply_to.present? && !msg_ids.key?(reply_to) }
      .map { |msg_id, _, _| msg_id }

    puts "#{not_found.size} records couldn't be associated with parents"

    # Message-IDs come straight from mail headers, so bind them instead of
    # quoting them into the SQL; slices keep the variable count bounded.
    not_found.each_slice(SQL_BIND_SLICE) do |slice|
      placeholders = (['?'] * slice.size).join(',')
      db.execute "UPDATE emails SET reply_to = NULL WHERE msg_id IN (#{placeholders})", slice
    end

    dupe_titles = db.execute "SELECT title, COUNT(*) FROM emails GROUP BY title HAVING count(*) > 1"
    puts "#{dupe_titles.size} replies to wire up"
    dupe_titles.each do |t|
      title = t[0]
      first = titles[title]
      db.execute "UPDATE emails SET reply_to = ? WHERE title = ? and msg_id <> ?", [first, title, first]
    end
  ensure
    db.close if db
  end

  # Returns [from_email, from_name] for +mail+.
  #
  # from_email is the first From address with " at " / " [at] " obfuscation
  # undone, or nil when the mail has no From value. from_name is the display
  # name, else a name given in parentheses, else the raw From header.
  def extract_name(mail)
    from_name = nil
    from = mail[:from]

    from_email = nil
    if mail.from.present?
      from_email = mail.from.dup
      if from_email.kind_of?(Array)
        if from_email[0].nil?
          print_warning "Cannot find email address (ignoring)!\n#{mail}"
        else
          from_email = from_email.first.dup
          from_email.gsub!(/ at /, '@')
          # Brackets must be escaped: / [at] / is a character class, not the literal text.
          from_email.gsub!(/ \[at\] /, '@')
          # strip real names in ()s. Todo: read into name
          from_email.gsub!(/ \(.*$/, '')
          from_email.gsub!(/ /, '')
        end
      end
    end

    display_names = from.try(:display_names)
    from_name = display_names.first if display_names.present?

    if from_name.blank? && from.to_s =~ /\(([^\)]+)\)/
      from_name = Regexp.last_match[1]
    end
    from_name = from.to_s if from_name.blank?

    [from_email, from_name]
  end

  # Writes +message+ to standard error.
  def print_warning(message)
    $stderr.puts message.to_s
  end

  # Rebuilds the emails table of the index from all messages. Messages whose
  # sender address is missing or malformed are skipped.
  def create_email_indices
    db = open_db
    db.execute "DROP TABLE IF EXISTS emails"
    db.execute <<-SQL
      CREATE TABLE emails (
        msg_id VARCHAR(995) PRIMARY KEY,
        from_email VARCHAR(255) NOT NULL,
        from_name VARCHAR(255) NOT NULL,
        title VARCHAR(255) NOT NULL,
        reply_to VARCHAR(955) NULL,
        email_date DATETIME NOT NULL,
        message TEXT NOT NULL,
        category VARCHAR(255) NOT NULL
      );
    SQL

    db.execute "CREATE INDEX by_title ON emails (title)"
    db.execute "CREATE INDEX by_email ON emails (from_email)"

    puts "", "creating indices"

    all_messages do |mail, filename|
      # The first path component below MBOX_DIR selects the category.
      directory = filename.sub("#{MBOX_DIR}/", '').split("/")[0]

      category = CATEGORY_MAPPINGS[directory] || CATEGORY_MAPPINGS['default'] || 'uncategorized'

      msg_id = mail['Message-ID'].to_s

      # Many ways to get a name
      from_email, from_name = extract_name(mail)

      title = clean_title(mail['Subject'].to_s)
      reply_to = mail['In-Reply-To'].to_s
      email_date = mail['date'].to_s
      email_date = DateTime.parse(email_date).to_s unless email_date.blank?

      next unless from_email.kind_of?(String)

      if from_email.match(/\A[\w+\-.]+@[a-z\d\-]+(\.[a-z\d\-]+)*\.[a-z]+\z/i)
        db.execute "INSERT OR IGNORE INTO emails (msg_id,
                                                  from_email,
                                                  from_name,
                                                  title,
                                                  reply_to,
                                                  email_date,
                                                  message,
                                                  category)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
                   [msg_id, from_email, from_name, title, reply_to, email_date, mail.to_s, category]
      else
        print_warning "Ignoring bad email address #{from_email} in #{msg_id}"
      end
    end
  ensure
    db.close if db
  end

  # Rebuilds the users table of the index from the senders in the emails table.
  def create_user_indices
    db = open_db
    db.execute "DROP TABLE IF EXISTS users"
    db.execute <<-SQL
      CREATE TABLE users (
        email VARCHAR(995) PRIMARY KEY,
        name VARCHAR(255) NOT NULL
      );
    SQL

    db.execute "INSERT OR IGNORE INTO users (email, name) SELECT from_email, from_name FROM emails"
  ensure
    db.close if db
  end

  # Returns +title+ without the [LIST_NAME] tag and without leading
  # reply/forward prefixes (standard and localized).
  def clean_title(title)
    title ||= ""
    # Strip mailing list name from subject
    title = title.gsub(/\[#{Regexp.escape(LIST_NAME)}\]/, '').strip

    original_length = title.length

    # Strip Reply prefix from title (Standard and localized)
    title = title.gsub(/^Re: */i, '')
    title = title.gsub(/^R: */i, '') # Italian
    title = title.gsub(/^RIF: */i, '') # Italian

    # Strip Forward prefix from title (Standard and localized)
    title = title.gsub(/^Fwd: */i, '')
    title = title.gsub(/^I: */i, '') # Italian

    # String#strip returns a new string, so the result has to be assigned.
    title = title.strip

    # In case of mixed localized prefixes there could be many of them if the mail client didn't strip the localized ones
    if original_length > title.length
      clean_title(title)
    else
      title
    end
  end

  # Returns a scrubbed copy of +input+ with the Google Groups unsubscribe
  # footer removed.
  def clean_raw(input)
    raw = input.dup
    raw.scrub!
    raw.gsub!(/-- \nYou received this message because you are subscribed to the Google Groups "[^"]*" group.\nTo unsubscribe from this group and stop receiving emails from it, send an email to [^+@]+\+unsubscribe@googlegroups.com\.\nFor more options, visit https:\/\/groups\.google\.com\/groups\/opt_out\./, '')

    raw
  end

  # Creates a user for every row of the users index, in batches of BATCH_SIZE.
  def import_users
    puts "", "importing users"
    db = open_db

    all_users = db.execute("SELECT name, email FROM users")
    total_count = all_users.size

    batches(BATCH_SIZE) do |offset|
      users = all_users[offset..offset + BATCH_SIZE - 1]
      # The slice is nil past the end and [] exactly at the end.
      break if users.blank?
      next if all_records_exist? :users, users.map { |u| u[1] }

      create_users(users, total: total_count, offset: offset) do |u|
        {
          id: u[1],
          email: u[1],
          name: u[0],
          trust_level: DEFAULT_TRUST_LEVEL,
        }
      end
    end
  ensure
    db.close if db
  end

  # Replaces every user's email address found in post bodies with that
  # user's @username. Not run by default (see #execute).
  def replace_email_addresses
    puts "", "replacing email addresses with @usernames"

    total_count = User.real.count
    progress_count = 0
    start_time = Time.now

    # from: https://meta.discourse.org/t/replace-a-string-in-all-posts/48729/17
    # and https://github.com/discourse/discourse/blob/master/lib/tasks/posts.rake#L114-L136
    User.find_each do |u|
      find = u.email.dup
      replace = "@#{u.username}"

      found = Post.where("raw ILIKE ?", "%#{find}%")
      next if found.count < 1

      found.each do |p|
        new_raw = p.raw.gsub(/#{Regexp.escape(find)}/i, replace)
        if new_raw != p.raw
          p.revise(Discourse.system_user, { raw: new_raw }, bypass_bump: true)
          print_warning "\nReplaced #{find} with #{replace} in topic #{p.topic_id}"
        end
      end

      progress_count += 1
      puts ""
      print_status(progress_count, total_count, start_time)
    end
  end

  # Returns the body that Email::Receiver selects from the raw message
  # +msg+, encoded as UTF-8.
  def parse_email(msg)
    receiver = Email::Receiver.new(msg)
    mail = Mail.read_from_string(msg)
    mail.body

    selected = receiver.select_body
    selected.force_encoding(selected.encoding).encode("UTF-8")
  end

  # Creates a topic for every indexed message that has no parent.
  def create_forum_topics
    puts "", "creating forum topics"
    db = open_db

    # datetime() keeps the time of day; DATE() would leave same-day messages in arbitrary order.
    all_topics = db.execute("SELECT msg_id,
                                    from_email,
                                    from_name,
                                    title,
                                    email_date,
                                    message,
                                    category
                            FROM emails
                            WHERE reply_to IS NULL
                            ORDER BY datetime(email_date)")

    topic_count = all_topics.size

    batches(BATCH_SIZE) do |offset|
      topics = all_topics[offset..offset + BATCH_SIZE - 1]
      break if topics.blank?

      next if all_records_exist? :posts, topics.map { |t| t[0] }

      create_posts(topics, total: topic_count, offset: offset) do |t|
        raw_email = t[5]
        receiver = Email::Receiver.new(raw_email)
        mail = Mail.read_from_string(raw_email)
        mail.body

        from_email, _ = extract_name(mail)
        selected = receiver.select_body
        next unless selected
        selected = selected.join('') if selected.kind_of?(Array)

        user_id = user_id_from_imported_user_id(from_email) || Discourse::SYSTEM_USER_ID

        raw = selected.force_encoding(selected.encoding).encode("UTF-8")
        raw = clean_raw(raw).to_s
        replace_sender_email!(raw, from_email)

        # Attachments go after the body so their markdown is not overwritten.
        raw << attachments_markdown(mail, receiver, user_id)

        {
          id: t[0],
          title: clean_title(mail.subject),
          user_id: user_id,
          created_at: mail.date,
          category: t[6],
          raw: clean_raw(raw),
          cook_method: Post.cook_methods[:email]
        }
      end
    end
  ensure
    db.close if db
  end

  # Creates a reply post for every indexed message that has a parent.
  # Messages whose parent cannot be mapped to an imported topic are skipped.
  def import_replies
    puts "", "creating topic replies"

    db = open_db
    # datetime() keeps the time of day; DATE() would leave same-day messages in arbitrary order.
    replies = db.execute("SELECT msg_id,
                                 from_email,
                                 from_name,
                                 title,
                                 email_date,
                                 message,
                                 reply_to
                          FROM emails
                          WHERE reply_to IS NOT NULL
                          ORDER BY datetime(email_date)
                        ")

    post_count = replies.size

    puts "Replies: #{post_count}"

    batches(BATCH_SIZE) do |offset|
      posts = replies[offset..offset + BATCH_SIZE - 1]
      break if posts.blank?

      next if all_records_exist? :posts, posts.map { |p| p[0] }

      create_posts(posts, total: post_count, offset: offset) do |p|
        parent_id = p[6]
        id = p[0]

        topic = topic_lookup_from_imported_post_id(parent_id)
        topic_id = topic[:topic_id] if topic
        next unless topic_id

        raw_email = p[5]
        receiver = Email::Receiver.new(raw_email)
        mail = Mail.read_from_string(raw_email)
        mail.body

        from_email, _ = extract_name(mail)

        selected = receiver.select_body
        selected = selected.join('') if selected.kind_of?(Array)
        next unless selected

        user_id = user_id_from_imported_user_id(from_email) || Discourse::SYSTEM_USER_ID

        raw = selected.force_encoding(selected.encoding).encode("UTF-8")
        raw = clean_raw(raw).to_s
        replace_sender_email!(raw, from_email)

        raw << attachments_markdown(mail, receiver, user_id)

        {
          id: id,
          topic_id: topic_id,
          user_id: user_id,
          created_at: mail.date,
          raw: clean_raw(raw),
          cook_method: Post.cook_methods[:email]
        }
      end
    end
  ensure
    db.close if db
  end

  private

  # Replaces the sender's address in +raw+ (in place), both as written and in
  # its "user at host" form, with an @mention of the matching user. Leaves
  # +raw+ untouched when no user has that address.
  def replace_sender_email!(raw, from_email)
    user = User.find_by_email(from_email)
    return raw unless user

    mention = "@#{user.username}"
    # String patterns are matched literally, so "." and "+" in the address
    # are not treated as regular expression syntax.
    raw.gsub!(from_email, mention)
    raw.gsub!(from_email.sub(/@/, ' at '), mention)
    raw
  end

  # Uploads every attachment of +mail+ on behalf of +user_id+ and returns the
  # concatenated markdown for the uploads that succeeded ("" when none did).
  def attachments_markdown(mail, receiver, user_id)
    markdown = ""

    mail.attachments.each do |attachment|
      tmp = Tempfile.new("discourse-email-attachment")
      begin
        # read attachment
        File.open(tmp.path, "w+b") { |f| f.write attachment.body.decoded }
        # create the upload for the user
        upload = UploadCreator.new(tmp, attachment.filename).create_for(user_id)
        if upload && upload.errors.empty?
          markdown << "\n\n#{receiver.attachment_markdown(upload)}\n\n"
        end
      ensure
        # Best-effort cleanup of the temporary file.
        tmp.try(:close!) rescue nil
      end
    end

    markdown
  end
end

ImportScripts::Mbox.new.perform