Improve mbox import script

* emails weren't sorted in the correct order
* better default regex for splitting mbox files
* output the Message-ID when an email is skipped because it doesn't have a Date
This commit is contained in:
Gerhard Schlager 2018-08-23 09:46:25 +02:00
parent 3d176d9984
commit ac743dab10
4 changed files with 21 additions and 15 deletions

View File

@ -93,7 +93,10 @@ module ImportScripts::Mbox
next if all_records_exist?(:posts, rows.map { |row| row['msg_id'] }) next if all_records_exist?(:posts, rows.map { |row| row['msg_id'] })
create_posts(rows, total: total_count, offset: offset) do |row| create_posts(rows, total: total_count, offset: offset) do |row|
if row['in_reply_to'].blank? if row['email_date'].blank?
puts "Date is missing. Skipping #{row['msg_id']}"
nil
elsif row['in_reply_to'].blank?
map_first_post(row) map_first_post(row)
else else
map_reply(row) map_reply(row)
@ -163,8 +166,8 @@ module ImportScripts::Mbox
) )
end end
def to_time(datetime) def to_time(timestamp)
Time.zone.at(DateTime.iso8601(datetime)) if datetime Time.zone.at(timestamp) if timestamp
end end
end end
end end

View File

@ -1,7 +1,7 @@
data_dir: /shared/import/data data_dir: /shared/import/data
# mbox files # mbox files
split_regex: "^From .+" split_regex: "^From .+@.+"
#split_regex: "^From .+@example.com.+" #split_regex: "^From .+@example.com.+"
# individual emails # individual emails
@ -11,7 +11,7 @@ split_regex: "^From .+"
#split_regex: "^=========================================================================" #split_regex: "^========================================================================="
default_trust_level: 1 default_trust_level: 1
prefer_html: false prefer_html: true
staged: true staged: true
index_only: false index_only: false

View File

@ -97,19 +97,20 @@ module ImportScripts::Mbox
@db.execute <<-SQL @db.execute <<-SQL
WITH RECURSIVE WITH RECURSIVE
messages(msg_id, level, email_date) AS ( messages(msg_id, level, email_date, in_reply_to) AS (
SELECT msg_id, 0 AS level, email_date SELECT msg_id, 0 AS level, email_date, in_reply_to
FROM email FROM email
WHERE in_reply_to IS NULL WHERE in_reply_to IS NULL
UNION ALL UNION ALL
SELECT e.msg_id, m.level + 1, e.email_date SELECT e.msg_id, m.level + 1, e.email_date, e.in_reply_to
FROM email e FROM email e
JOIN messages m ON e.in_reply_to = m.msg_id JOIN messages m ON e.in_reply_to = m.msg_id
ORDER BY level, email_date, msg_id
) )
INSERT INTO email_order (msg_id) INSERT INTO email_order (msg_id)
SELECT msg_id SELECT c.msg_id
FROM messages FROM messages c
LEFT OUTER JOIN messages p ON (c.in_reply_to = p.msg_id)
ORDER BY MAX(c.email_date, p.email_date), c.level, c.email_date, c.msg_id
SQL SQL
end end
@ -175,7 +176,6 @@ module ImportScripts::Mbox
@db.get_first_value <<-SQL @db.get_first_value <<-SQL
SELECT COUNT(*) SELECT COUNT(*)
FROM email FROM email
WHERE email_date IS NOT NULL
SQL SQL
end end
@ -185,8 +185,7 @@ module ImportScripts::Mbox
raw_message, body, elided, format, attachment_count, category raw_message, body, elided, format, attachment_count, category
FROM email e FROM email e
JOIN email_order o USING (msg_id) JOIN email_order o USING (msg_id)
WHERE email_date IS NOT NULL AND WHERE o.ROWID > :last_row_id
o.ROWID > :last_row_id
ORDER BY o.ROWID ORDER BY o.ROWID
LIMIT #{@batch_size} LIMIT #{@batch_size}
SQL SQL

View File

@ -74,7 +74,7 @@ module ImportScripts::Mbox
from_email: from_email, from_email: from_email,
from_name: from_display_name, from_name: from_display_name,
subject: extract_subject(receiver, category_name), subject: extract_subject(receiver, category_name),
email_date: parsed_email.date&.to_s, email_date: timestamp(parsed_email.date),
raw_message: receiver.raw_email, raw_message: receiver.raw_email,
body: body, body: body,
elided: elided, elided: elided,
@ -256,5 +256,9 @@ module ImportScripts::Mbox
def monotonic_time def monotonic_time
Process.clock_gettime(Process::CLOCK_MONOTONIC) Process.clock_gettime(Process::CLOCK_MONOTONIC)
end end
# Turns an email's date into an integer by passing it through
# Time.zone.at(...).to_i. A missing (nil/false) date yields nil so the
# caller can store "no date" as-is.
def timestamp(datetime)
  return unless datetime

  Time.zone.at(datetime).to_i
end
end end
end end