mbox: Improve error checking and reporting
This commit is contained in:
parent
3d02dc28be
commit
1886f021e2
|
@ -1,9 +1,6 @@
|
||||||
require 'sqlite3'
|
require 'sqlite3'
|
||||||
require File.expand_path(File.dirname(__FILE__) + "/base.rb")
|
require File.expand_path(File.dirname(__FILE__) + "/base.rb")
|
||||||
|
|
||||||
# TODO: ignore ~ emacs backup files
|
|
||||||
# DONE: sort filenames before processing
|
|
||||||
|
|
||||||
# Paste these lines into your shell before running this:
|
# Paste these lines into your shell before running this:
|
||||||
|
|
||||||
=begin
|
=begin
|
||||||
|
@ -41,6 +38,13 @@ class ImportScripts::Mbox < ImportScripts::Base
|
||||||
# ex: "jobs-folder" => "jobs"
|
# ex: "jobs-folder" => "jobs"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Refuse to run when the configured import directory does not exist.
# Kernel#abort writes the message to standard error and terminates the
# process with a non-zero exit status, so shells and wrapper scripts can
# detect the failure (a bare `exit` reports success).
unless File.directory?(MBOX_DIR)
  abort "Cannot find import directory #{MBOX_DIR}. Giving up."
end
|
||||||
|
|
||||||
|
# Declares a format check on :email, applied on :create. The pattern requires
# a local part with no "@" or whitespace, an "@", one or more dot-terminated
# labels of letters/digits/hyphens, and a final label of at least two letters
# (case-insensitive; \Z also tolerates one trailing newline).
# NOTE(review): validates_format_of is an ActiveModel-style class macro; confirm
# that this class or its parent actually provides it, since nothing visible here
# includes a validations module. TODO confirm this line is intended.
validates_format_of :email, :with => /\A([^@\s]+)@((?:[-a-z0-9]+\.)+[a-z]{2,})\Z/i, :on => :create
|
||||||
|
|
||||||
def execute
|
def execute
|
||||||
import_categories
|
import_categories
|
||||||
create_email_indices
|
create_email_indices
|
||||||
|
@ -49,6 +53,7 @@ class ImportScripts::Mbox < ImportScripts::Base
|
||||||
import_users
|
import_users
|
||||||
create_forum_topics
|
create_forum_topics
|
||||||
import_replies
|
import_replies
|
||||||
|
# replace_email_addresses # uncomment to replace all email addresses with @username
|
||||||
end
|
end
|
||||||
|
|
||||||
def import_categories
|
def import_categories
|
||||||
|
@ -91,16 +96,21 @@ class ImportScripts::Mbox < ImportScripts::Base
|
||||||
files.sort!
|
files.sort!
|
||||||
|
|
||||||
files.each_with_index do |f, idx|
|
files.each_with_index do |f, idx|
|
||||||
|
print_warning "\nProcessing: #{f}"
|
||||||
|
start_time = Time.now
|
||||||
|
|
||||||
if SPLIT_AT.present?
|
if SPLIT_AT.present?
|
||||||
msg = ""
|
msg = ""
|
||||||
|
message_count = 0
|
||||||
|
|
||||||
each_line(f) do |line|
|
each_line(f) do |line|
|
||||||
line = line.scrub
|
line = line.scrub
|
||||||
if line =~ SPLIT_AT
|
if line =~ SPLIT_AT
|
||||||
|
p message_count += 1
|
||||||
if !msg.empty?
|
if !msg.empty?
|
||||||
mail = Mail.read_from_string(msg)
|
mail = Mail.read_from_string(msg)
|
||||||
yield mail, f
|
yield mail, f
|
||||||
print_status(idx, files.size)
|
print_status(idx, files.size, start_time)
|
||||||
msg = ""
|
msg = ""
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
@ -110,14 +120,14 @@ class ImportScripts::Mbox < ImportScripts::Base
|
||||||
if !msg.empty?
|
if !msg.empty?
|
||||||
mail = Mail.read_from_string(msg)
|
mail = Mail.read_from_string(msg)
|
||||||
yield mail, f
|
yield mail, f
|
||||||
print_status(idx, files.size)
|
print_status(idx, files.size, start_time)
|
||||||
msg = ""
|
msg = ""
|
||||||
end
|
end
|
||||||
else
|
else
|
||||||
raw = File.read(f)
|
raw = File.read(f)
|
||||||
mail = Mail.read_from_string(raw)
|
mail = Mail.read_from_string(raw)
|
||||||
yield mail, f
|
yield mail, f
|
||||||
print_status(idx, files.size)
|
print_status(idx, files.size, start_time)
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
@ -133,8 +143,10 @@ class ImportScripts::Mbox < ImportScripts::Base
|
||||||
titles = {}
|
titles = {}
|
||||||
rows.each do |row|
|
rows.each do |row|
|
||||||
msg_ids[row[0]] = true
|
msg_ids[row[0]] = true
|
||||||
|
if titles[row[1]].nil?
|
||||||
titles[row[1]] = row[0]
|
titles[row[1]] = row[0]
|
||||||
end
|
end
|
||||||
|
end
|
||||||
|
|
||||||
# First, any replies where the parent doesn't exist should have that field cleared
|
# First, any replies where the parent doesn't exist should have that field cleared
|
||||||
not_found = []
|
not_found = []
|
||||||
|
@ -171,12 +183,18 @@ class ImportScripts::Mbox < ImportScripts::Base
|
||||||
if mail.from.present?
|
if mail.from.present?
|
||||||
from_email = mail.from.dup
|
from_email = mail.from.dup
|
||||||
if from_email.kind_of?(Array)
|
if from_email.kind_of?(Array)
|
||||||
|
if from_email[0].nil?
|
||||||
|
print_warning "Cannot find email address (ignoring)!\n#{mail}"
|
||||||
|
else
|
||||||
from_email = from_email.first.dup
|
from_email = from_email.first.dup
|
||||||
end
|
|
||||||
|
|
||||||
from_email.gsub!(/ at /, '@')
|
from_email.gsub!(/ at /, '@')
|
||||||
|
from_email.gsub!(/ [at] /, '@')
|
||||||
|
# strip real names in ()s. Todo: read into name
|
||||||
from_email.gsub!(/ \(.*$/, '')
|
from_email.gsub!(/ \(.*$/, '')
|
||||||
|
from_email.gsub!(/ /, '')
|
||||||
end
|
end
|
||||||
|
end
|
||||||
|
p end
|
||||||
|
|
||||||
display_names = from.try(:display_names)
|
display_names = from.try(:display_names)
|
||||||
if display_names.present?
|
if display_names.present?
|
||||||
|
@ -191,6 +209,10 @@ class ImportScripts::Mbox < ImportScripts::Base
|
||||||
[from_email, from_name]
|
[from_email, from_name]
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Reports a warning on standard error rather than standard output.
#
# @param message [#to_s] the text to report; it is converted with +to_s+,
#   so +nil+ produces an empty line
# @return [nil]
def print_warning(message)
  # message.to_s replaces the redundant "#{message}" interpolation; the
  # emitted text is the same.
  $stderr.puts(message.to_s)
end
|
||||||
|
|
||||||
def create_email_indices
|
def create_email_indices
|
||||||
db = open_db
|
db = open_db
|
||||||
db.execute "DROP TABLE IF EXISTS emails"
|
db.execute "DROP TABLE IF EXISTS emails"
|
||||||
|
@ -228,6 +250,10 @@ class ImportScripts::Mbox < ImportScripts::Base
|
||||||
email_date = mail['date'].to_s
|
email_date = mail['date'].to_s
|
||||||
email_date = DateTime.parse(email_date).to_s unless email_date.blank?
|
email_date = DateTime.parse(email_date).to_s unless email_date.blank?
|
||||||
|
|
||||||
|
if from_email.kind_of?(String)
|
||||||
|
unless from_email.match(/\A[\w+\-.]+@[a-z\d\-]+(\.[a-z\d\-]+)*\.[a-z]+\z/i)
|
||||||
|
print_warning "Ignoring bad email address #{from_email} in #{msg_id}"
|
||||||
|
else
|
||||||
db.execute "INSERT OR IGNORE INTO emails (msg_id,
|
db.execute "INSERT OR IGNORE INTO emails (msg_id,
|
||||||
from_email,
|
from_email,
|
||||||
from_name,
|
from_name,
|
||||||
|
@ -239,6 +265,8 @@ class ImportScripts::Mbox < ImportScripts::Base
|
||||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
|
||||||
[msg_id, from_email, from_name, title, reply_to, email_date, mail.to_s, category]
|
[msg_id, from_email, from_name, title, reply_to, email_date, mail.to_s, category]
|
||||||
end
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
ensure
|
ensure
|
||||||
db.close
|
db.close
|
||||||
end
|
end
|
||||||
|
@ -285,8 +313,8 @@ class ImportScripts::Mbox < ImportScripts::Base
|
||||||
end
|
end
|
||||||
|
|
||||||
def clean_raw(input)
|
def clean_raw(input)
|
||||||
|
|
||||||
raw = input.dup
|
raw = input.dup
|
||||||
|
raw.scrub!
|
||||||
raw.gsub!(/-- \nYou received this message because you are subscribed to the Google Groups "[^"]*" group.\nTo unsubscribe from this group and stop receiving emails from it, send an email to [^+@]+\+unsubscribe@googlegroups.com\.\nFor more options, visit https:\/\/groups\.google\.com\/groups\/opt_out\./, '')
|
raw.gsub!(/-- \nYou received this message because you are subscribed to the Google Groups "[^"]*" group.\nTo unsubscribe from this group and stop receiving emails from it, send an email to [^+@]+\+unsubscribe@googlegroups.com\.\nFor more options, visit https:\/\/groups\.google\.com\/groups\/opt_out\./, '')
|
||||||
|
|
||||||
raw
|
raw
|
||||||
|
@ -411,9 +439,12 @@ class ImportScripts::Mbox < ImportScripts::Base
|
||||||
|
|
||||||
post_count = replies.size
|
post_count = replies.size
|
||||||
|
|
||||||
|
puts "Replies: #{post_count}"
|
||||||
|
|
||||||
batches(BATCH_SIZE) do |offset|
|
batches(BATCH_SIZE) do |offset|
|
||||||
posts = replies[offset..offset+BATCH_SIZE-1]
|
posts = replies[offset..offset+BATCH_SIZE-1]
|
||||||
break if posts.nil?
|
break if posts.nil?
|
||||||
|
break if posts.count < 1
|
||||||
|
|
||||||
next if all_records_exist? :posts, posts.map {|p| p[0]}
|
next if all_records_exist? :posts, posts.map {|p| p[0]}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue