Improve mbox import script
* emails weren't sorted in the correct order
* better default regex for splitting mbox files
* output the Message-ID when an email is skipped because it doesn't have a Date
This commit is contained in:
parent
3d176d9984
commit
ac743dab10
|
@ -93,7 +93,10 @@ module ImportScripts::Mbox
|
||||||
next if all_records_exist?(:posts, rows.map { |row| row['msg_id'] })
|
next if all_records_exist?(:posts, rows.map { |row| row['msg_id'] })
|
||||||
|
|
||||||
create_posts(rows, total: total_count, offset: offset) do |row|
|
create_posts(rows, total: total_count, offset: offset) do |row|
|
||||||
if row['in_reply_to'].blank?
|
if row['email_date'].blank?
|
||||||
|
puts "Date is missing. Skipping #{row['msg_id']}"
|
||||||
|
nil
|
||||||
|
elsif row['in_reply_to'].blank?
|
||||||
map_first_post(row)
|
map_first_post(row)
|
||||||
else
|
else
|
||||||
map_reply(row)
|
map_reply(row)
|
||||||
|
@ -163,8 +166,8 @@ module ImportScripts::Mbox
|
||||||
)
|
)
|
||||||
end
|
end
|
||||||
|
|
||||||
def to_time(datetime)
|
def to_time(timestamp)
|
||||||
Time.zone.at(DateTime.iso8601(datetime)) if datetime
|
Time.zone.at(timestamp) if timestamp
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
data_dir: /shared/import/data
|
data_dir: /shared/import/data
|
||||||
|
|
||||||
# mbox files
|
# mbox files
|
||||||
split_regex: "^From .+"
|
split_regex: "^From .+@.+"
|
||||||
#split_regex: "^From .+@example.com.+"
|
#split_regex: "^From .+@example.com.+"
|
||||||
|
|
||||||
# individual emails
|
# individual emails
|
||||||
|
@ -11,7 +11,7 @@ split_regex: "^From .+"
|
||||||
#split_regex: "^========================================================================="
|
#split_regex: "^========================================================================="
|
||||||
|
|
||||||
default_trust_level: 1
|
default_trust_level: 1
|
||||||
prefer_html: false
|
prefer_html: true
|
||||||
staged: true
|
staged: true
|
||||||
index_only: false
|
index_only: false
|
||||||
|
|
||||||
|
|
|
@ -97,19 +97,20 @@ module ImportScripts::Mbox
|
||||||
|
|
||||||
@db.execute <<-SQL
|
@db.execute <<-SQL
|
||||||
WITH RECURSIVE
|
WITH RECURSIVE
|
||||||
messages(msg_id, level, email_date) AS (
|
messages(msg_id, level, email_date, in_reply_to) AS (
|
||||||
SELECT msg_id, 0 AS level, email_date
|
SELECT msg_id, 0 AS level, email_date, in_reply_to
|
||||||
FROM email
|
FROM email
|
||||||
WHERE in_reply_to IS NULL
|
WHERE in_reply_to IS NULL
|
||||||
UNION ALL
|
UNION ALL
|
||||||
SELECT e.msg_id, m.level + 1, e.email_date
|
SELECT e.msg_id, m.level + 1, e.email_date, e.in_reply_to
|
||||||
FROM email e
|
FROM email e
|
||||||
JOIN messages m ON e.in_reply_to = m.msg_id
|
JOIN messages m ON e.in_reply_to = m.msg_id
|
||||||
ORDER BY level, email_date, msg_id
|
|
||||||
)
|
)
|
||||||
INSERT INTO email_order (msg_id)
|
INSERT INTO email_order (msg_id)
|
||||||
SELECT msg_id
|
SELECT c.msg_id
|
||||||
FROM messages
|
FROM messages c
|
||||||
|
LEFT OUTER JOIN messages p ON (c.in_reply_to = p.msg_id)
|
||||||
|
ORDER BY MAX(c.email_date, p.email_date), c.level, c.email_date, c.msg_id
|
||||||
SQL
|
SQL
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -175,7 +176,6 @@ module ImportScripts::Mbox
|
||||||
@db.get_first_value <<-SQL
|
@db.get_first_value <<-SQL
|
||||||
SELECT COUNT(*)
|
SELECT COUNT(*)
|
||||||
FROM email
|
FROM email
|
||||||
WHERE email_date IS NOT NULL
|
|
||||||
SQL
|
SQL
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -185,8 +185,7 @@ module ImportScripts::Mbox
|
||||||
raw_message, body, elided, format, attachment_count, category
|
raw_message, body, elided, format, attachment_count, category
|
||||||
FROM email e
|
FROM email e
|
||||||
JOIN email_order o USING (msg_id)
|
JOIN email_order o USING (msg_id)
|
||||||
WHERE email_date IS NOT NULL AND
|
WHERE o.ROWID > :last_row_id
|
||||||
o.ROWID > :last_row_id
|
|
||||||
ORDER BY o.ROWID
|
ORDER BY o.ROWID
|
||||||
LIMIT #{@batch_size}
|
LIMIT #{@batch_size}
|
||||||
SQL
|
SQL
|
||||||
|
|
|
@ -74,7 +74,7 @@ module ImportScripts::Mbox
|
||||||
from_email: from_email,
|
from_email: from_email,
|
||||||
from_name: from_display_name,
|
from_name: from_display_name,
|
||||||
subject: extract_subject(receiver, category_name),
|
subject: extract_subject(receiver, category_name),
|
||||||
email_date: parsed_email.date&.to_s,
|
email_date: timestamp(parsed_email.date),
|
||||||
raw_message: receiver.raw_email,
|
raw_message: receiver.raw_email,
|
||||||
body: body,
|
body: body,
|
||||||
elided: elided,
|
elided: elided,
|
||||||
|
@ -256,5 +256,9 @@ module ImportScripts::Mbox
|
||||||
def monotonic_time
|
def monotonic_time
|
||||||
Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def timestamp(datetime)
|
||||||
|
Time.zone.at(datetime).to_i if datetime
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue