Improve mbox import script

* emails weren't sorted in the correct order
* better default regex for splitting mbox files
* output the Message-ID if an email is skipped because it doesn't have a Date
This commit is contained in:
Gerhard Schlager 2018-08-23 09:46:25 +02:00
parent 3d176d9984
commit ac743dab10
4 changed files with 21 additions and 15 deletions

View File

@ -93,7 +93,10 @@ module ImportScripts::Mbox
next if all_records_exist?(:posts, rows.map { |row| row['msg_id'] })
create_posts(rows, total: total_count, offset: offset) do |row|
if row['in_reply_to'].blank?
if row['email_date'].blank?
puts "Date is missing. Skipping #{row['msg_id']}"
nil
elsif row['in_reply_to'].blank?
map_first_post(row)
else
map_reply(row)
@ -163,8 +166,8 @@ module ImportScripts::Mbox
)
end
def to_time(datetime)
Time.zone.at(DateTime.iso8601(datetime)) if datetime
# Turns +timestamp+ into a time object via +Time.zone.at+.
#
# Returns nil when +timestamp+ is nil (or false), so rows without a date
# pass through without raising.
# NOTE(review): presumably +timestamp+ is the integer epoch value stored in
# the index's email_date column — confirm against the caller.
def to_time(timestamp)
  return unless timestamp

  Time.zone.at(timestamp)
end
end
end

View File

@ -1,7 +1,7 @@
data_dir: /shared/import/data
# mbox files
split_regex: "^From .+"
split_regex: "^From .+@.+"
#split_regex: "^From .+@example.com.+"
# individual emails
@ -11,7 +11,7 @@ split_regex: "^From .+"
#split_regex: "^========================================================================="
default_trust_level: 1
prefer_html: false
prefer_html: true
staged: true
index_only: false

View File

@ -97,19 +97,20 @@ module ImportScripts::Mbox
@db.execute <<-SQL
WITH RECURSIVE
messages(msg_id, level, email_date) AS (
SELECT msg_id, 0 AS level, email_date
messages(msg_id, level, email_date, in_reply_to) AS (
SELECT msg_id, 0 AS level, email_date, in_reply_to
FROM email
WHERE in_reply_to IS NULL
UNION ALL
SELECT e.msg_id, m.level + 1, e.email_date
SELECT e.msg_id, m.level + 1, e.email_date, e.in_reply_to
FROM email e
JOIN messages m ON e.in_reply_to = m.msg_id
ORDER BY level, email_date, msg_id
)
INSERT INTO email_order (msg_id)
SELECT msg_id
FROM messages
SELECT c.msg_id
FROM messages c
LEFT OUTER JOIN messages p ON (c.in_reply_to = p.msg_id)
ORDER BY MAX(c.email_date, p.email_date), c.level, c.email_date, c.msg_id
SQL
end
@ -175,7 +176,6 @@ module ImportScripts::Mbox
@db.get_first_value <<-SQL
SELECT COUNT(*)
FROM email
WHERE email_date IS NOT NULL
SQL
end
@ -185,8 +185,7 @@ module ImportScripts::Mbox
raw_message, body, elided, format, attachment_count, category
FROM email e
JOIN email_order o USING (msg_id)
WHERE email_date IS NOT NULL AND
o.ROWID > :last_row_id
WHERE o.ROWID > :last_row_id
ORDER BY o.ROWID
LIMIT #{@batch_size}
SQL

View File

@ -74,7 +74,7 @@ module ImportScripts::Mbox
from_email: from_email,
from_name: from_display_name,
subject: extract_subject(receiver, category_name),
email_date: parsed_email.date&.to_s,
email_date: timestamp(parsed_email.date),
raw_message: receiver.raw_email,
body: body,
elided: elided,
@ -256,5 +256,9 @@ module ImportScripts::Mbox
# Reads the monotonic clock and returns it as a Float number of seconds.
# Unlike wall-clock time, this clock is not affected by system clock
# adjustments, so it suits measuring elapsed durations.
def monotonic_time
  # :float_second is the default unit; it is spelled out for clarity.
  Process.clock_gettime(Process::CLOCK_MONOTONIC, :float_second)
end
# Converts +datetime+ into an Integer by passing it through +Time.zone.at+
# and calling +to_i+ on the result.
#
# Returns nil when +datetime+ is nil (or false), so a missing date is kept
# as a missing value rather than raising.
def timestamp(datetime)
  return unless datetime

  zoned = Time.zone.at(datetime)
  zoned.to_i
end
end
end