mbox: Improve error checking and reporting

This commit is contained in:
Jay Pfaffman 2016-10-07 09:38:54 -07:00
parent 3d02dc28be
commit 1886f021e2
1 changed file with 46 additions and 15 deletions

View File

@ -1,9 +1,6 @@
require 'sqlite3'
require File.expand_path(File.dirname(__FILE__) + "/base.rb")
# TODO: ignore ~ emacs backup files
# DONE: sort filenames before processing
# Paste these lines into your shell before running this:
=begin
@ -41,6 +38,13 @@ class ImportScripts::Mbox < ImportScripts::Base
# ex: "jobs-folder" => "jobs"
}
# Fail fast when the configured import directory is missing.
# Exit with a non-zero status so that a calling shell or wrapper script can
# tell that the import did not run (a bare `exit` reports success, status 0).
unless File.directory?(MBOX_DIR)
  puts "Cannot find import directory #{MBOX_DIR}. Giving up."
  exit 1
end
# NOTE(review): validates_format_of is normally an ActiveModel/ActiveRecord
# validation macro. Nothing visible here shows ImportScripts::Base providing
# it — confirm it is defined, otherwise this line raises NoMethodError when the
# class body is evaluated. Also note \Z permits a trailing newline; \z is the
# strict end-of-string anchor — TODO confirm which is intended.
validates_format_of :email, :with => /\A([^@\s]+)@((?:[-a-z0-9]+\.)+[a-z]{2,})\Z/i, :on => :create
def execute
import_categories
create_email_indices
@ -49,6 +53,7 @@ class ImportScripts::Mbox < ImportScripts::Base
import_users
create_forum_topics
import_replies
# replace_email_addresses # uncomment to replace all email addresses with @username
end
def import_categories
@ -91,16 +96,21 @@ class ImportScripts::Mbox < ImportScripts::Base
files.sort!
files.each_with_index do |f, idx|
print_warning "\nProcessing: #{f}"
start_time = Time.now
if SPLIT_AT.present?
msg = ""
message_count = 0
each_line(f) do |line|
line = line.scrub
if line =~ SPLIT_AT
p message_count += 1
if !msg.empty?
mail = Mail.read_from_string(msg)
yield mail, f
print_status(idx, files.size)
print_status(idx, files.size, start_time)
msg = ""
end
end
@ -110,14 +120,14 @@ class ImportScripts::Mbox < ImportScripts::Base
if !msg.empty?
mail = Mail.read_from_string(msg)
yield mail, f
print_status(idx, files.size)
print_status(idx, files.size, start_time)
msg = ""
end
else
raw = File.read(f)
mail = Mail.read_from_string(raw)
yield mail, f
print_status(idx, files.size)
print_status(idx, files.size, start_time)
end
end
@ -133,8 +143,10 @@ class ImportScripts::Mbox < ImportScripts::Base
titles = {}
rows.each do |row|
msg_ids[row[0]] = true
if titles[row[1]].nil?
titles[row[1]] = row[0]
end
end
# First, any replies where the parent doesn't exist should have that field cleared
not_found = []
@ -171,12 +183,18 @@ class ImportScripts::Mbox < ImportScripts::Base
if mail.from.present?
from_email = mail.from.dup
if from_email.kind_of?(Array)
if from_email[0].nil?
print_warning "Cannot find email address (ignoring)!\n#{mail}"
else
from_email = from_email.first.dup
end
from_email.gsub!(/ at /, '@')
from_email.gsub!(/ [at] /, '@')
# strip real names in ()s. Todo: read into name
from_email.gsub!(/ \(.*$/, '')
from_email.gsub!(/ /, '')
end
end
p end
display_names = from.try(:display_names)
if display_names.present?
@ -191,6 +209,10 @@ class ImportScripts::Mbox < ImportScripts::Base
[from_email, from_name]
end
# Report a warning on the standard error stream ($stderr) instead of stdout.
#
# message - the object to report; it is converted to a String before printing.
def print_warning(message)
  $stderr.puts(message.to_s)
end
def create_email_indices
db = open_db
db.execute "DROP TABLE IF EXISTS emails"
@ -228,6 +250,10 @@ class ImportScripts::Mbox < ImportScripts::Base
email_date = mail['date'].to_s
email_date = DateTime.parse(email_date).to_s unless email_date.blank?
if from_email.kind_of?(String)
unless from_email.match(/\A[\w+\-.]+@[a-z\d\-]+(\.[a-z\d\-]+)*\.[a-z]+\z/i)
print_warning "Ignoring bad email address #{from_email} in #{msg_id}"
else
db.execute "INSERT OR IGNORE INTO emails (msg_id,
from_email,
from_name,
@ -239,6 +265,8 @@ class ImportScripts::Mbox < ImportScripts::Base
VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
[msg_id, from_email, from_name, title, reply_to, email_date, mail.to_s, category]
end
end
end
ensure
db.close
end
@ -285,8 +313,8 @@ class ImportScripts::Mbox < ImportScripts::Base
end
def clean_raw(input)
raw = input.dup
raw.scrub!
raw.gsub!(/-- \nYou received this message because you are subscribed to the Google Groups "[^"]*" group.\nTo unsubscribe from this group and stop receiving emails from it, send an email to [^+@]+\+unsubscribe@googlegroups.com\.\nFor more options, visit https:\/\/groups\.google\.com\/groups\/opt_out\./, '')
raw
@ -411,9 +439,12 @@ class ImportScripts::Mbox < ImportScripts::Base
post_count = replies.size
puts "Replies: #{post_count}"
batches(BATCH_SIZE) do |offset|
posts = replies[offset..offset+BATCH_SIZE-1]
break if posts.nil?
break if posts.count < 1
next if all_records_exist? :posts, posts.map {|p| p[0]}