Improve mbox import script
* Emails weren't sorted in the correct order
* Better default regex for splitting mbox files
* Output the Message-ID if an email is skipped because it doesn't have a Date
This commit is contained in:
parent
3d176d9984
commit
ac743dab10
|
@ -93,7 +93,10 @@ module ImportScripts::Mbox
|
|||
next if all_records_exist?(:posts, rows.map { |row| row['msg_id'] })
|
||||
|
||||
create_posts(rows, total: total_count, offset: offset) do |row|
|
||||
if row['in_reply_to'].blank?
|
||||
if row['email_date'].blank?
|
||||
puts "Date is missing. Skipping #{row['msg_id']}"
|
||||
nil
|
||||
elsif row['in_reply_to'].blank?
|
||||
map_first_post(row)
|
||||
else
|
||||
map_reply(row)
|
||||
|
@ -163,8 +166,8 @@ module ImportScripts::Mbox
|
|||
)
|
||||
end
|
||||
|
||||
# NOTE(review): two variants of to_time appear back to back in this diff view
# and share the single `end` below; presumably the first is the version being
# replaced and the second is its replacement — confirm against the commit.
#
# Variant taking `datetime`: parses the argument with DateTime.iso8601 and
# passes the result to Time.zone.at. Yields nil when the argument is nil/false.
def to_time(datetime)
|
||||
Time.zone.at(DateTime.iso8601(datetime)) if datetime
|
||||
# Variant taking `timestamp`: passes the argument straight to Time.zone.at
# without any parsing. Yields nil when the argument is nil/false.
def to_time(timestamp)
|
||||
Time.zone.at(timestamp) if timestamp
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
data_dir: /shared/import/data
|
||||
|
||||
# mbox files
|
||||
split_regex: "^From .+"
|
||||
split_regex: "^From .+@.+"
|
||||
#split_regex: "^From .+@example.com.+"
|
||||
|
||||
# individual emails
|
||||
|
@ -11,7 +11,7 @@ split_regex: "^From .+"
|
|||
#split_regex: "^========================================================================="
|
||||
|
||||
default_trust_level: 1
|
||||
prefer_html: false
|
||||
prefer_html: true
|
||||
staged: true
|
||||
index_only: false
|
||||
|
||||
|
|
|
@ -97,19 +97,20 @@ module ImportScripts::Mbox
|
|||
|
||||
@db.execute <<-SQL
|
||||
WITH RECURSIVE
|
||||
messages(msg_id, level, email_date) AS (
|
||||
SELECT msg_id, 0 AS level, email_date
|
||||
messages(msg_id, level, email_date, in_reply_to) AS (
|
||||
SELECT msg_id, 0 AS level, email_date, in_reply_to
|
||||
FROM email
|
||||
WHERE in_reply_to IS NULL
|
||||
UNION ALL
|
||||
SELECT e.msg_id, m.level + 1, e.email_date
|
||||
SELECT e.msg_id, m.level + 1, e.email_date, e.in_reply_to
|
||||
FROM email e
|
||||
JOIN messages m ON e.in_reply_to = m.msg_id
|
||||
ORDER BY level, email_date, msg_id
|
||||
)
|
||||
INSERT INTO email_order (msg_id)
|
||||
SELECT msg_id
|
||||
FROM messages
|
||||
SELECT c.msg_id
|
||||
FROM messages c
|
||||
LEFT OUTER JOIN messages p ON (c.in_reply_to = p.msg_id)
|
||||
ORDER BY MAX(c.email_date, p.email_date), c.level, c.email_date, c.msg_id
|
||||
SQL
|
||||
end
|
||||
|
||||
|
@ -175,7 +176,6 @@ module ImportScripts::Mbox
|
|||
@db.get_first_value <<-SQL
|
||||
SELECT COUNT(*)
|
||||
FROM email
|
||||
WHERE email_date IS NOT NULL
|
||||
SQL
|
||||
end
|
||||
|
||||
|
@ -185,8 +185,7 @@ module ImportScripts::Mbox
|
|||
raw_message, body, elided, format, attachment_count, category
|
||||
FROM email e
|
||||
JOIN email_order o USING (msg_id)
|
||||
WHERE email_date IS NOT NULL AND
|
||||
o.ROWID > :last_row_id
|
||||
WHERE o.ROWID > :last_row_id
|
||||
ORDER BY o.ROWID
|
||||
LIMIT #{@batch_size}
|
||||
SQL
|
||||
|
|
|
@ -74,7 +74,7 @@ module ImportScripts::Mbox
|
|||
from_email: from_email,
|
||||
from_name: from_display_name,
|
||||
subject: extract_subject(receiver, category_name),
|
||||
email_date: parsed_email.date&.to_s,
|
||||
email_date: timestamp(parsed_email.date),
|
||||
raw_message: receiver.raw_email,
|
||||
body: body,
|
||||
elided: elided,
|
||||
|
@ -256,5 +256,9 @@ module ImportScripts::Mbox
|
|||
# Returns the current reading of the monotonic clock as reported by
# Process.clock_gettime. The monotonic clock is meant for measuring elapsed
# time between two readings, not for obtaining a wall-clock date.
def monotonic_time
|
||||
Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
||||
end
|
||||
|
||||
# Converts +datetime+ to an integer by passing it through Time.zone.at and
# calling to_i on the result. Returns nil when +datetime+ is nil/false, so a
# missing date stays missing instead of raising.
# NOTE(review): Time.zone is not core Ruby; presumably ActiveSupport is loaded
# by the surrounding project — confirm.
def timestamp(datetime)
|
||||
Time.zone.at(datetime).to_i if datetime
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue