Improvements to mbox importer
* store the time it took to index each message in the DB (to find performance issues)
* ignore Listserv-specific files
* better examples for split_regex
* the first email in an mbox shouldn't contain the split string
* always lock the DB in exclusive mode
* save each email within a transaction
* messages can be grouped by subject and use the original order (for Listserv)
* adds an option to index emails without running the import
This commit is contained in:
parent
5d7a33cd6d
commit
bb54eb1192
|
@ -239,7 +239,7 @@ module Email
|
||||||
|
|
||||||
if text.present?
|
if text.present?
|
||||||
text = trim_discourse_markers(text)
|
text = trim_discourse_markers(text)
|
||||||
text, elided_text = EmailReplyTrimmer.trim(text, true)
|
text, elided_text = trim_reply_and_extract_elided(text)
|
||||||
|
|
||||||
if @opts[:convert_plaintext] || sent_to_mailinglist_mirror?
|
if @opts[:convert_plaintext] || sent_to_mailinglist_mirror?
|
||||||
text_content_type ||= ""
|
text_content_type ||= ""
|
||||||
|
@ -255,7 +255,7 @@ module Email
|
||||||
markdown, elided_markdown = if html.present?
|
markdown, elided_markdown = if html.present?
|
||||||
markdown = HtmlToMarkdown.new(html, keep_img_tags: true, keep_cid_imgs: true).to_markdown
|
markdown = HtmlToMarkdown.new(html, keep_img_tags: true, keep_cid_imgs: true).to_markdown
|
||||||
markdown = trim_discourse_markers(markdown)
|
markdown = trim_discourse_markers(markdown)
|
||||||
EmailReplyTrimmer.trim(markdown, true)
|
trim_reply_and_extract_elided(markdown)
|
||||||
end
|
end
|
||||||
|
|
||||||
if text.blank? || (SiteSetting.incoming_email_prefer_html && markdown.present?)
|
if text.blank? || (SiteSetting.incoming_email_prefer_html && markdown.present?)
|
||||||
|
@ -265,6 +265,11 @@ module Email
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def trim_reply_and_extract_elided(text)
|
||||||
|
return [text, ""] if @opts[:skip_trimming]
|
||||||
|
EmailReplyTrimmer.trim(text, true)
|
||||||
|
end
|
||||||
|
|
||||||
def fix_charset(mail_part)
|
def fix_charset(mail_part)
|
||||||
return nil if mail_part.blank? || mail_part.body.blank?
|
return nil if mail_part.blank? || mail_part.body.blank?
|
||||||
|
|
||||||
|
|
|
@ -31,6 +31,7 @@ class ImportScripts::Base
|
||||||
@site_settings_during_import = {}
|
@site_settings_during_import = {}
|
||||||
@old_site_settings = {}
|
@old_site_settings = {}
|
||||||
@start_times = { import: Time.now }
|
@start_times = { import: Time.now }
|
||||||
|
@skip_updates = false
|
||||||
end
|
end
|
||||||
|
|
||||||
def preload_i18n
|
def preload_i18n
|
||||||
|
@ -46,6 +47,7 @@ class ImportScripts::Base
|
||||||
|
|
||||||
puts ""
|
puts ""
|
||||||
|
|
||||||
|
unless @skip_updates
|
||||||
update_bumped_at
|
update_bumped_at
|
||||||
update_last_posted_at
|
update_last_posted_at
|
||||||
update_last_seen_at
|
update_last_seen_at
|
||||||
|
@ -54,6 +56,7 @@ class ImportScripts::Base
|
||||||
update_category_featured_topics
|
update_category_featured_topics
|
||||||
update_topic_count_replies
|
update_topic_count_replies
|
||||||
reset_topic_counters
|
reset_topic_counters
|
||||||
|
end
|
||||||
|
|
||||||
elapsed = Time.now - @start_times[:import]
|
elapsed = Time.now - @start_times[:import]
|
||||||
puts '', '', 'Done (%02dh %02dmin %02dsec)' % [elapsed / 3600, elapsed / 60 % 60, elapsed % 60]
|
puts '', '', 'Done (%02dh %02dmin %02dsec)' % [elapsed / 3600, elapsed / 60 % 60, elapsed % 60]
|
||||||
|
|
|
@ -24,10 +24,15 @@ module ImportScripts::Mbox
|
||||||
|
|
||||||
def execute
|
def execute
|
||||||
index_messages
|
index_messages
|
||||||
|
|
||||||
|
if @settings.index_only
|
||||||
|
@skip_updates = true
|
||||||
|
else
|
||||||
import_categories
|
import_categories
|
||||||
import_users
|
import_users
|
||||||
import_posts
|
import_posts
|
||||||
end
|
end
|
||||||
|
end
|
||||||
|
|
||||||
def index_messages
|
def index_messages
|
||||||
puts '', 'creating index'
|
puts '', 'creating index'
|
||||||
|
|
|
@ -1,11 +1,18 @@
|
||||||
# PostgreSQL mailing lists
|
|
||||||
#data_dir: /shared/import/data
|
|
||||||
#split_regex: "^From .*@postgresql.org.*"
|
|
||||||
|
|
||||||
# ruby-talk mailing list
|
|
||||||
data_dir: /shared/import/data
|
data_dir: /shared/import/data
|
||||||
split_regex: ""
|
|
||||||
|
# mbox files
|
||||||
|
split_regex: "^From .+"
|
||||||
|
#split_regex: "^From .+@example.com.+"
|
||||||
|
|
||||||
|
# individual emails
|
||||||
|
#split_regex: ""
|
||||||
|
|
||||||
|
# Listserv files
|
||||||
|
#split_regex: "^========================================================================="
|
||||||
|
|
||||||
default_trust_level: 1
|
default_trust_level: 1
|
||||||
prefer_html: false
|
prefer_html: false
|
||||||
staged: true
|
staged: true
|
||||||
|
index_only: false
|
||||||
|
|
||||||
|
group_messages_by_subject: false
|
||||||
|
|
|
@ -2,7 +2,7 @@ require 'sqlite3'
|
||||||
|
|
||||||
module ImportScripts::Mbox
|
module ImportScripts::Mbox
|
||||||
class Database
|
class Database
|
||||||
SCHEMA_VERSION = 1
|
SCHEMA_VERSION = 2
|
||||||
|
|
||||||
def initialize(directory, batch_size)
|
def initialize(directory, batch_size)
|
||||||
@db = SQLite3::Database.new("#{directory}/index.db", results_as_hash: true)
|
@db = SQLite3::Database.new("#{directory}/index.db", results_as_hash: true)
|
||||||
|
@ -17,6 +17,15 @@ module ImportScripts::Mbox
|
||||||
create_table_for_users
|
create_table_for_users
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def transaction
|
||||||
|
@db.transaction
|
||||||
|
yield self
|
||||||
|
@db.commit
|
||||||
|
|
||||||
|
rescue
|
||||||
|
@db.rollback
|
||||||
|
end
|
||||||
|
|
||||||
def insert_category(category)
|
def insert_category(category)
|
||||||
@db.execute(<<-SQL, category)
|
@db.execute(<<-SQL, category)
|
||||||
INSERT OR REPLACE INTO category (name, description)
|
INSERT OR REPLACE INTO category (name, description)
|
||||||
|
@ -35,10 +44,10 @@ module ImportScripts::Mbox
|
||||||
@db.execute(<<-SQL, email)
|
@db.execute(<<-SQL, email)
|
||||||
INSERT OR REPLACE INTO email (msg_id, from_email, from_name, subject,
|
INSERT OR REPLACE INTO email (msg_id, from_email, from_name, subject,
|
||||||
email_date, raw_message, body, elided, format, attachment_count, charset,
|
email_date, raw_message, body, elided, format, attachment_count, charset,
|
||||||
category, filename, first_line_number, last_line_number)
|
category, filename, first_line_number, last_line_number, index_duration)
|
||||||
VALUES (:msg_id, :from_email, :from_name, :subject,
|
VALUES (:msg_id, :from_email, :from_name, :subject,
|
||||||
:email_date, :raw_message, :body, :elided, :format, :attachment_count, :charset,
|
:email_date, :raw_message, :body, :elided, :format, :attachment_count, :charset,
|
||||||
:category, :filename, :first_line_number, :last_line_number)
|
:category, :filename, :first_line_number, :last_line_number, :index_duration)
|
||||||
SQL
|
SQL
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -69,7 +78,21 @@ module ImportScripts::Mbox
|
||||||
SQL
|
SQL
|
||||||
end
|
end
|
||||||
|
|
||||||
def sort_emails
|
def update_in_reply_to_by_email_subject
|
||||||
|
@db.execute <<-SQL
|
||||||
|
UPDATE email
|
||||||
|
SET in_reply_to = NULLIF((
|
||||||
|
SELECT e.msg_id
|
||||||
|
FROM email e
|
||||||
|
JOIN email_order o ON (e.msg_id = o.msg_id)
|
||||||
|
WHERE e.subject = email.subject
|
||||||
|
ORDER BY o.ROWID
|
||||||
|
LIMIT 1
|
||||||
|
), msg_id)
|
||||||
|
SQL
|
||||||
|
end
|
||||||
|
|
||||||
|
def sort_emails_by_date_and_reply_level
|
||||||
@db.execute 'DELETE FROM email_order'
|
@db.execute 'DELETE FROM email_order'
|
||||||
|
|
||||||
@db.execute <<-SQL
|
@db.execute <<-SQL
|
||||||
|
@ -90,6 +113,17 @@ module ImportScripts::Mbox
|
||||||
SQL
|
SQL
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def sort_emails_by_subject
|
||||||
|
@db.execute 'DELETE FROM email_order'
|
||||||
|
|
||||||
|
@db.execute <<-SQL
|
||||||
|
INSERT INTO email_order (msg_id)
|
||||||
|
SELECT msg_id
|
||||||
|
FROM email
|
||||||
|
ORDER BY subject, filename, ROWID
|
||||||
|
SQL
|
||||||
|
end
|
||||||
|
|
||||||
def fill_users_from_emails
|
def fill_users_from_emails
|
||||||
@db.execute 'DELETE FROM user'
|
@db.execute 'DELETE FROM user'
|
||||||
|
|
||||||
|
@ -164,10 +198,17 @@ module ImportScripts::Mbox
|
||||||
|
|
||||||
def configure_database
|
def configure_database
|
||||||
@db.execute 'PRAGMA journal_mode = OFF'
|
@db.execute 'PRAGMA journal_mode = OFF'
|
||||||
|
@db.execute 'PRAGMA locking_mode = EXCLUSIVE'
|
||||||
end
|
end
|
||||||
|
|
||||||
def upgrade_schema_version
|
def upgrade_schema_version
|
||||||
# current_version = query("PRAGMA user_version").last[0]
|
current_version = @db.get_first_value("PRAGMA user_version")
|
||||||
|
|
||||||
|
case current_version
|
||||||
|
when 1
|
||||||
|
@db.execute "ALTER TABLE email ADD COLUMN index_duration REAL"
|
||||||
|
end
|
||||||
|
|
||||||
@db.execute "PRAGMA user_version = #{SCHEMA_VERSION}"
|
@db.execute "PRAGMA user_version = #{SCHEMA_VERSION}"
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -211,11 +252,13 @@ module ImportScripts::Mbox
|
||||||
filename TEXT NOT NULL,
|
filename TEXT NOT NULL,
|
||||||
first_line_number INTEGER,
|
first_line_number INTEGER,
|
||||||
last_line_number INTEGER,
|
last_line_number INTEGER,
|
||||||
|
index_duration REAL,
|
||||||
FOREIGN KEY(category) REFERENCES category(name)
|
FOREIGN KEY(category) REFERENCES category(name)
|
||||||
)
|
)
|
||||||
SQL
|
SQL
|
||||||
|
|
||||||
@db.execute 'CREATE INDEX IF NOT EXISTS email_by_from ON email (from_email)'
|
@db.execute 'CREATE INDEX IF NOT EXISTS email_by_from ON email (from_email)'
|
||||||
|
@db.execute 'CREATE INDEX IF NOT EXISTS email_by_subject ON email (subject)'
|
||||||
@db.execute 'CREATE INDEX IF NOT EXISTS email_by_in_reply_to ON email (in_reply_to)'
|
@db.execute 'CREATE INDEX IF NOT EXISTS email_by_in_reply_to ON email (in_reply_to)'
|
||||||
@db.execute 'CREATE INDEX IF NOT EXISTS email_by_date ON email (email_date)'
|
@db.execute 'CREATE INDEX IF NOT EXISTS email_by_date ON email (email_date)'
|
||||||
|
|
||||||
|
|
|
@ -8,12 +8,12 @@ module ImportScripts::Mbox
|
||||||
# @param settings [ImportScripts::Mbox::Settings]
|
# @param settings [ImportScripts::Mbox::Settings]
|
||||||
def initialize(database, settings)
|
def initialize(database, settings)
|
||||||
@database = database
|
@database = database
|
||||||
@root_directory = settings.data_dir
|
@settings = settings
|
||||||
@split_regex = settings.split_regex
|
@split_regex = settings.split_regex
|
||||||
end
|
end
|
||||||
|
|
||||||
def execute
|
def execute
|
||||||
directories = Dir.glob(File.join(@root_directory, '*'))
|
directories = Dir.glob(File.join(@settings.data_dir, '*'))
|
||||||
directories.select! { |f| File.directory?(f) }
|
directories.select! { |f| File.directory?(f) }
|
||||||
directories.sort!
|
directories.sort!
|
||||||
|
|
||||||
|
@ -24,14 +24,21 @@ module ImportScripts::Mbox
|
||||||
end
|
end
|
||||||
|
|
||||||
puts '', 'indexing replies and users'
|
puts '', 'indexing replies and users'
|
||||||
|
if @settings.group_messages_by_subject
|
||||||
|
@database.sort_emails_by_subject
|
||||||
|
@database.update_in_reply_to_by_email_subject
|
||||||
|
else
|
||||||
@database.update_in_reply_to_of_emails
|
@database.update_in_reply_to_of_emails
|
||||||
@database.sort_emails
|
@database.sort_emails_by_date_and_reply_level
|
||||||
|
end
|
||||||
|
|
||||||
@database.fill_users_from_emails
|
@database.fill_users_from_emails
|
||||||
end
|
end
|
||||||
|
|
||||||
private
|
private
|
||||||
|
|
||||||
METADATA_FILENAME = 'metadata.yml'.freeze
|
METADATA_FILENAME = 'metadata.yml'.freeze
|
||||||
|
IGNORED_FILE_EXTENSIONS = ['.dbindex', '.dbnames', '.digest', '.subjects']
|
||||||
|
|
||||||
def index_category(directory)
|
def index_category(directory)
|
||||||
metadata_file = File.join(directory, METADATA_FILENAME)
|
metadata_file = File.join(directory, METADATA_FILENAME)
|
||||||
|
@ -54,7 +61,7 @@ module ImportScripts::Mbox
|
||||||
end
|
end
|
||||||
|
|
||||||
def index_emails(directory, category_name)
|
def index_emails(directory, category_name)
|
||||||
all_messages(directory, category_name) do |receiver, filename, first_line_number, last_line_number|
|
all_messages(directory, category_name) do |receiver, filename, opts|
|
||||||
msg_id = receiver.message_id
|
msg_id = receiver.message_id
|
||||||
parsed_email = receiver.mail
|
parsed_email = receiver.mail
|
||||||
from_email, from_display_name = receiver.parse_from_field(parsed_email)
|
from_email, from_display_name = receiver.parse_from_field(parsed_email)
|
||||||
|
@ -75,12 +82,15 @@ module ImportScripts::Mbox
|
||||||
charset: parsed_email.charset&.downcase,
|
charset: parsed_email.charset&.downcase,
|
||||||
category: category_name,
|
category: category_name,
|
||||||
filename: File.basename(filename),
|
filename: File.basename(filename),
|
||||||
first_line_number: first_line_number,
|
first_line_number: opts[:first_line_number],
|
||||||
last_line_number: last_line_number
|
last_line_number: opts[:last_line_number],
|
||||||
|
index_duration: (monotonic_time - opts[:start_time]).round(4)
|
||||||
}
|
}
|
||||||
|
|
||||||
@database.insert_email(email)
|
@database.transaction do |db|
|
||||||
@database.insert_replies(msg_id, reply_message_ids) unless reply_message_ids.empty?
|
db.insert_email(email)
|
||||||
|
db.insert_replies(msg_id, reply_message_ids) unless reply_message_ids.empty?
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -102,12 +112,18 @@ module ImportScripts::Mbox
|
||||||
|
|
||||||
if @split_regex.present?
|
if @split_regex.present?
|
||||||
each_mail(filename) do |raw_message, first_line_number, last_line_number|
|
each_mail(filename) do |raw_message, first_line_number, last_line_number|
|
||||||
|
opts = {
|
||||||
|
first_line_number: first_line_number,
|
||||||
|
last_line_number: last_line_number,
|
||||||
|
start_time: monotonic_time
|
||||||
|
}
|
||||||
receiver = read_mail_from_string(raw_message)
|
receiver = read_mail_from_string(raw_message)
|
||||||
yield receiver, filename, first_line_number, last_line_number if receiver.present?
|
yield receiver, filename, opts if receiver.present?
|
||||||
end
|
end
|
||||||
else
|
else
|
||||||
|
opts = { start_time: monotonic_time }
|
||||||
receiver = read_mail_from_file(filename)
|
receiver = read_mail_from_file(filename)
|
||||||
yield receiver, filename if receiver.present?
|
yield receiver, filename, opts if receiver.present?
|
||||||
end
|
end
|
||||||
|
|
||||||
mark_as_fully_indexed(category_name, filename)
|
mark_as_fully_indexed(category_name, filename)
|
||||||
|
@ -132,10 +148,12 @@ module ImportScripts::Mbox
|
||||||
each_line(filename) do |line|
|
each_line(filename) do |line|
|
||||||
line = line.scrub
|
line = line.scrub
|
||||||
|
|
||||||
if line =~ @split_regex && last_line_number.positive?
|
if line =~ @split_regex
|
||||||
|
if last_line_number > 0
|
||||||
yield raw_message, first_line_number, last_line_number
|
yield raw_message, first_line_number, last_line_number
|
||||||
raw_message = ''
|
raw_message = ''
|
||||||
first_line_number = last_line_number + 1
|
first_line_number = last_line_number + 1
|
||||||
|
end
|
||||||
else
|
else
|
||||||
raw_message << line
|
raw_message << line
|
||||||
end
|
end
|
||||||
|
@ -163,7 +181,7 @@ module ImportScripts::Mbox
|
||||||
end
|
end
|
||||||
|
|
||||||
def read_mail_from_string(raw_message)
|
def read_mail_from_string(raw_message)
|
||||||
Email::Receiver.new(raw_message, convert_plaintext: true) unless raw_message.blank?
|
Email::Receiver.new(raw_message, convert_plaintext: true, skip_trimming: false) unless raw_message.blank?
|
||||||
end
|
end
|
||||||
|
|
||||||
def extract_reply_message_ids(mail)
|
def extract_reply_message_ids(mail)
|
||||||
|
@ -210,16 +228,12 @@ module ImportScripts::Mbox
|
||||||
end
|
end
|
||||||
|
|
||||||
def ignored_file?(filename, checksums)
|
def ignored_file?(filename, checksums)
|
||||||
File.directory?(filename) || hidden_file?(filename) ||
|
filename = File.basename(filename)
|
||||||
metadata_file?(filename) || fully_indexed?(filename, checksums)
|
|
||||||
end
|
|
||||||
|
|
||||||
def hidden_file?(filename)
|
filename.start_with?('.') ||
|
||||||
File.basename(filename).start_with?('.')
|
filename == METADATA_FILENAME ||
|
||||||
end
|
IGNORED_FILE_EXTENSIONS.include?(File.extname(filename)) ||
|
||||||
|
fully_indexed?(filename, checksums)
|
||||||
def metadata_file?(filename)
|
|
||||||
File.basename(filename) == METADATA_FILENAME
|
|
||||||
end
|
end
|
||||||
|
|
||||||
def fully_indexed?(filename, checksums)
|
def fully_indexed?(filename, checksums)
|
||||||
|
@ -230,5 +244,9 @@ module ImportScripts::Mbox
|
||||||
def calc_checksum(filename)
|
def calc_checksum(filename)
|
||||||
Digest::SHA256.file(filename).hexdigest
|
Digest::SHA256.file(filename).hexdigest
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def monotonic_time
|
||||||
|
Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -13,6 +13,8 @@ module ImportScripts::Mbox
|
||||||
attr_reader :trust_level
|
attr_reader :trust_level
|
||||||
attr_reader :prefer_html
|
attr_reader :prefer_html
|
||||||
attr_reader :staged
|
attr_reader :staged
|
||||||
|
attr_reader :index_only
|
||||||
|
attr_reader :group_messages_by_subject
|
||||||
|
|
||||||
def initialize(yaml)
|
def initialize(yaml)
|
||||||
@data_dir = yaml['data_dir']
|
@data_dir = yaml['data_dir']
|
||||||
|
@ -21,6 +23,8 @@ module ImportScripts::Mbox
|
||||||
@trust_level = yaml['default_trust_level']
|
@trust_level = yaml['default_trust_level']
|
||||||
@prefer_html = yaml['prefer_html']
|
@prefer_html = yaml['prefer_html']
|
||||||
@staged = yaml['staged']
|
@staged = yaml['staged']
|
||||||
|
@index_only = yaml['index_only']
|
||||||
|
@group_messages_by_subject = yaml['group_messages_by_subject']
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue