Improvements to mbox importer

* store time it took to index message in DB (to find performance issues)
* ignore listserv specific files
* better examples for split_regex
* first email in mbox shouldn't contain the split string
* always lock the DB in exclusive mode
* save email within transaction
* messages can be grouped by subject and use original order (for Listserv)
* adds option to index emails without running the import
This commit is contained in:
Gerhard Schlager 2018-01-17 12:03:57 +01:00
parent 5d7a33cd6d
commit bb54eb1192
7 changed files with 134 additions and 49 deletions

View File

@ -239,7 +239,7 @@ module Email
if text.present? if text.present?
text = trim_discourse_markers(text) text = trim_discourse_markers(text)
text, elided_text = EmailReplyTrimmer.trim(text, true) text, elided_text = trim_reply_and_extract_elided(text)
if @opts[:convert_plaintext] || sent_to_mailinglist_mirror? if @opts[:convert_plaintext] || sent_to_mailinglist_mirror?
text_content_type ||= "" text_content_type ||= ""
@ -255,7 +255,7 @@ module Email
markdown, elided_markdown = if html.present? markdown, elided_markdown = if html.present?
markdown = HtmlToMarkdown.new(html, keep_img_tags: true, keep_cid_imgs: true).to_markdown markdown = HtmlToMarkdown.new(html, keep_img_tags: true, keep_cid_imgs: true).to_markdown
markdown = trim_discourse_markers(markdown) markdown = trim_discourse_markers(markdown)
EmailReplyTrimmer.trim(markdown, true) trim_reply_and_extract_elided(markdown)
end end
if text.blank? || (SiteSetting.incoming_email_prefer_html && markdown.present?) if text.blank? || (SiteSetting.incoming_email_prefer_html && markdown.present?)
@ -265,6 +265,11 @@ module Email
end end
end end
# Splits +text+ into the part to keep and the elided (trimmed-away) part.
#
# When the +:skip_trimming+ option is set in @opts, the text is returned
# untouched together with an empty elided string. Otherwise the work is
# delegated to EmailReplyTrimmer.trim (called with +true+ as second
# argument) and its result is returned as-is.
#
# @param text the email body (plain text or markdown) to trim
# @return [Array] a +[text, elided]+ pair; callers destructure it into two values
def trim_reply_and_extract_elided(text)
  if @opts[:skip_trimming]
    [text, ""]
  else
    EmailReplyTrimmer.trim(text, true)
  end
end
def fix_charset(mail_part) def fix_charset(mail_part)
return nil if mail_part.blank? || mail_part.body.blank? return nil if mail_part.blank? || mail_part.body.blank?

View File

@ -31,6 +31,7 @@ class ImportScripts::Base
@site_settings_during_import = {} @site_settings_during_import = {}
@old_site_settings = {} @old_site_settings = {}
@start_times = { import: Time.now } @start_times = { import: Time.now }
@skip_updates = false
end end
def preload_i18n def preload_i18n
@ -46,6 +47,7 @@ class ImportScripts::Base
puts "" puts ""
unless @skip_updates
update_bumped_at update_bumped_at
update_last_posted_at update_last_posted_at
update_last_seen_at update_last_seen_at
@ -54,6 +56,7 @@ class ImportScripts::Base
update_category_featured_topics update_category_featured_topics
update_topic_count_replies update_topic_count_replies
reset_topic_counters reset_topic_counters
end
elapsed = Time.now - @start_times[:import] elapsed = Time.now - @start_times[:import]
puts '', '', 'Done (%02dh %02dmin %02dsec)' % [elapsed / 3600, elapsed / 60 % 60, elapsed % 60] puts '', '', 'Done (%02dh %02dmin %02dsec)' % [elapsed / 3600, elapsed / 60 % 60, elapsed % 60]

View File

@ -24,10 +24,15 @@ module ImportScripts::Mbox
def execute def execute
index_messages index_messages
if @settings.index_only
@skip_updates = true
else
import_categories import_categories
import_users import_users
import_posts import_posts
end end
end
def index_messages def index_messages
puts '', 'creating index' puts '', 'creating index'

View File

@ -1,11 +1,18 @@
# PostgreSQL mailing lists
#data_dir: /shared/import/data
#split_regex: "^From .*@postgresql.org.*"
# ruby-talk mailing list
data_dir: /shared/import/data data_dir: /shared/import/data
split_regex: ""
# mbox files
split_regex: "^From .+"
#split_regex: "^From .+@example.com.+"
# individual emails
#split_regex: ""
# Listserv files
#split_regex: "^========================================================================="
default_trust_level: 1 default_trust_level: 1
prefer_html: false prefer_html: false
staged: true staged: true
index_only: false
group_messages_by_subject: false

View File

@ -2,7 +2,7 @@ require 'sqlite3'
module ImportScripts::Mbox module ImportScripts::Mbox
class Database class Database
SCHEMA_VERSION = 1 SCHEMA_VERSION = 2
def initialize(directory, batch_size) def initialize(directory, batch_size)
@db = SQLite3::Database.new("#{directory}/index.db", results_as_hash: true) @db = SQLite3::Database.new("#{directory}/index.db", results_as_hash: true)
@ -17,6 +17,15 @@ module ImportScripts::Mbox
create_table_for_users create_table_for_users
end end
# Runs the given block inside a database transaction.
#
# Begins a transaction on @db, yields this object to the block and commits
# afterwards. If anything raises a StandardError, the transaction is rolled
# back and the error is re-raised so that the caller notices the failure
# instead of silently losing the writes made inside the block.
#
# NOTE(review): configure_database sets `PRAGMA journal_mode = OFF`; the
# SQLite documentation describes ROLLBACK as undefined in that mode --
# confirm that rolling back really restores the previous state.
#
# @yieldparam db [self] the object the block should issue its writes on
# @return the result of committing the transaction
# @raise [StandardError] whatever the block (or the commit) raised
def transaction
  @db.transaction
  yield self
  @db.commit
rescue
  @db.rollback
  # Propagate the failure; swallowing it here would hide failed inserts.
  raise
end
def insert_category(category) def insert_category(category)
@db.execute(<<-SQL, category) @db.execute(<<-SQL, category)
INSERT OR REPLACE INTO category (name, description) INSERT OR REPLACE INTO category (name, description)
@ -35,10 +44,10 @@ module ImportScripts::Mbox
@db.execute(<<-SQL, email) @db.execute(<<-SQL, email)
INSERT OR REPLACE INTO email (msg_id, from_email, from_name, subject, INSERT OR REPLACE INTO email (msg_id, from_email, from_name, subject,
email_date, raw_message, body, elided, format, attachment_count, charset, email_date, raw_message, body, elided, format, attachment_count, charset,
category, filename, first_line_number, last_line_number) category, filename, first_line_number, last_line_number, index_duration)
VALUES (:msg_id, :from_email, :from_name, :subject, VALUES (:msg_id, :from_email, :from_name, :subject,
:email_date, :raw_message, :body, :elided, :format, :attachment_count, :charset, :email_date, :raw_message, :body, :elided, :format, :attachment_count, :charset,
:category, :filename, :first_line_number, :last_line_number) :category, :filename, :first_line_number, :last_line_number, :index_duration)
SQL SQL
end end
@ -69,7 +78,21 @@ module ImportScripts::Mbox
SQL SQL
end end
def sort_emails def update_in_reply_to_by_email_subject
@db.execute <<-SQL
UPDATE email
SET in_reply_to = NULLIF((
SELECT e.msg_id
FROM email e
JOIN email_order o ON (e.msg_id = o.msg_id)
WHERE e.subject = email.subject
ORDER BY o.ROWID
LIMIT 1
), msg_id)
SQL
end
def sort_emails_by_date_and_reply_level
@db.execute 'DELETE FROM email_order' @db.execute 'DELETE FROM email_order'
@db.execute <<-SQL @db.execute <<-SQL
@ -90,6 +113,17 @@ module ImportScripts::Mbox
SQL SQL
end end
# Rebuilds the email_order table so that messages are ordered by subject,
# then by filename, then by their ROWID in the email table.
#
# Existing rows of email_order are deleted first; the table is then refilled
# with the msg_id of every row in email, in that order.
def sort_emails_by_subject
  @db.execute('DELETE FROM email_order')

  insert_sorted_ids = <<~SQL
    INSERT INTO email_order (msg_id)
    SELECT msg_id
    FROM email
    ORDER BY subject, filename, ROWID
  SQL
  @db.execute(insert_sorted_ids)
end
def fill_users_from_emails def fill_users_from_emails
@db.execute 'DELETE FROM user' @db.execute 'DELETE FROM user'
@ -164,10 +198,17 @@ module ImportScripts::Mbox
def configure_database def configure_database
@db.execute 'PRAGMA journal_mode = OFF' @db.execute 'PRAGMA journal_mode = OFF'
@db.execute 'PRAGMA locking_mode = EXCLUSIVE'
end end
def upgrade_schema_version def upgrade_schema_version
# current_version = query("PRAGMA user_version").last[0] current_version = @db.get_first_value("PRAGMA user_version")
case current_version
when 1
@db.execute "ALTER TABLE email ADD COLUMN index_duration REAL"
end
@db.execute "PRAGMA user_version = #{SCHEMA_VERSION}" @db.execute "PRAGMA user_version = #{SCHEMA_VERSION}"
end end
@ -211,11 +252,13 @@ module ImportScripts::Mbox
filename TEXT NOT NULL, filename TEXT NOT NULL,
first_line_number INTEGER, first_line_number INTEGER,
last_line_number INTEGER, last_line_number INTEGER,
index_duration REAL,
FOREIGN KEY(category) REFERENCES category(name) FOREIGN KEY(category) REFERENCES category(name)
) )
SQL SQL
@db.execute 'CREATE INDEX IF NOT EXISTS email_by_from ON email (from_email)' @db.execute 'CREATE INDEX IF NOT EXISTS email_by_from ON email (from_email)'
@db.execute 'CREATE INDEX IF NOT EXISTS email_by_subject ON email (subject)'
@db.execute 'CREATE INDEX IF NOT EXISTS email_by_in_reply_to ON email (in_reply_to)' @db.execute 'CREATE INDEX IF NOT EXISTS email_by_in_reply_to ON email (in_reply_to)'
@db.execute 'CREATE INDEX IF NOT EXISTS email_by_date ON email (email_date)' @db.execute 'CREATE INDEX IF NOT EXISTS email_by_date ON email (email_date)'

View File

@ -8,12 +8,12 @@ module ImportScripts::Mbox
# @param settings [ImportScripts::Mbox::Settings] # @param settings [ImportScripts::Mbox::Settings]
def initialize(database, settings) def initialize(database, settings)
@database = database @database = database
@root_directory = settings.data_dir @settings = settings
@split_regex = settings.split_regex @split_regex = settings.split_regex
end end
def execute def execute
directories = Dir.glob(File.join(@root_directory, '*')) directories = Dir.glob(File.join(@settings.data_dir, '*'))
directories.select! { |f| File.directory?(f) } directories.select! { |f| File.directory?(f) }
directories.sort! directories.sort!
@ -24,14 +24,21 @@ module ImportScripts::Mbox
end end
puts '', 'indexing replies and users' puts '', 'indexing replies and users'
if @settings.group_messages_by_subject
@database.sort_emails_by_subject
@database.update_in_reply_to_by_email_subject
else
@database.update_in_reply_to_of_emails @database.update_in_reply_to_of_emails
@database.sort_emails @database.sort_emails_by_date_and_reply_level
end
@database.fill_users_from_emails @database.fill_users_from_emails
end end
private private
METADATA_FILENAME = 'metadata.yml'.freeze METADATA_FILENAME = 'metadata.yml'.freeze
IGNORED_FILE_EXTENSIONS = ['.dbindex', '.dbnames', '.digest', '.subjects']
def index_category(directory) def index_category(directory)
metadata_file = File.join(directory, METADATA_FILENAME) metadata_file = File.join(directory, METADATA_FILENAME)
@ -54,7 +61,7 @@ module ImportScripts::Mbox
end end
def index_emails(directory, category_name) def index_emails(directory, category_name)
all_messages(directory, category_name) do |receiver, filename, first_line_number, last_line_number| all_messages(directory, category_name) do |receiver, filename, opts|
msg_id = receiver.message_id msg_id = receiver.message_id
parsed_email = receiver.mail parsed_email = receiver.mail
from_email, from_display_name = receiver.parse_from_field(parsed_email) from_email, from_display_name = receiver.parse_from_field(parsed_email)
@ -75,12 +82,15 @@ module ImportScripts::Mbox
charset: parsed_email.charset&.downcase, charset: parsed_email.charset&.downcase,
category: category_name, category: category_name,
filename: File.basename(filename), filename: File.basename(filename),
first_line_number: first_line_number, first_line_number: opts[:first_line_number],
last_line_number: last_line_number last_line_number: opts[:last_line_number],
index_duration: (monotonic_time - opts[:start_time]).round(4)
} }
@database.insert_email(email) @database.transaction do |db|
@database.insert_replies(msg_id, reply_message_ids) unless reply_message_ids.empty? db.insert_email(email)
db.insert_replies(msg_id, reply_message_ids) unless reply_message_ids.empty?
end
end end
end end
@ -102,12 +112,18 @@ module ImportScripts::Mbox
if @split_regex.present? if @split_regex.present?
each_mail(filename) do |raw_message, first_line_number, last_line_number| each_mail(filename) do |raw_message, first_line_number, last_line_number|
opts = {
first_line_number: first_line_number,
last_line_number: last_line_number,
start_time: monotonic_time
}
receiver = read_mail_from_string(raw_message) receiver = read_mail_from_string(raw_message)
yield receiver, filename, first_line_number, last_line_number if receiver.present? yield receiver, filename, opts if receiver.present?
end end
else else
opts = { start_time: monotonic_time }
receiver = read_mail_from_file(filename) receiver = read_mail_from_file(filename)
yield receiver, filename if receiver.present? yield receiver, filename, opts if receiver.present?
end end
mark_as_fully_indexed(category_name, filename) mark_as_fully_indexed(category_name, filename)
@ -132,10 +148,12 @@ module ImportScripts::Mbox
each_line(filename) do |line| each_line(filename) do |line|
line = line.scrub line = line.scrub
if line =~ @split_regex && last_line_number.positive? if line =~ @split_regex
if last_line_number > 0
yield raw_message, first_line_number, last_line_number yield raw_message, first_line_number, last_line_number
raw_message = '' raw_message = ''
first_line_number = last_line_number + 1 first_line_number = last_line_number + 1
end
else else
raw_message << line raw_message << line
end end
@ -163,7 +181,7 @@ module ImportScripts::Mbox
end end
def read_mail_from_string(raw_message) def read_mail_from_string(raw_message)
Email::Receiver.new(raw_message, convert_plaintext: true) unless raw_message.blank? Email::Receiver.new(raw_message, convert_plaintext: true, skip_trimming: false) unless raw_message.blank?
end end
def extract_reply_message_ids(mail) def extract_reply_message_ids(mail)
@ -210,16 +228,12 @@ module ImportScripts::Mbox
end end
def ignored_file?(filename, checksums) def ignored_file?(filename, checksums)
File.directory?(filename) || hidden_file?(filename) || filename = File.basename(filename)
metadata_file?(filename) || fully_indexed?(filename, checksums)
end
def hidden_file?(filename) filename.start_with?('.') ||
File.basename(filename).start_with?('.') filename == METADATA_FILENAME ||
end IGNORED_FILE_EXTENSIONS.include?(File.extname(filename)) ||
fully_indexed?(filename, checksums)
def metadata_file?(filename)
File.basename(filename) == METADATA_FILENAME
end end
def fully_indexed?(filename, checksums) def fully_indexed?(filename, checksums)
@ -230,5 +244,9 @@ module ImportScripts::Mbox
def calc_checksum(filename) def calc_checksum(filename)
Digest::SHA256.file(filename).hexdigest Digest::SHA256.file(filename).hexdigest
end end
# Reads the monotonic clock, in seconds as a Float.
#
# The indexer takes the difference of two readings to compute how long a
# message took to index (stored as index_duration).
#
# @return [Float] monotonic clock reading in seconds
def monotonic_time
  # :float_second is the default unit; spelled out here for clarity.
  Process.clock_gettime(Process::CLOCK_MONOTONIC, :float_second)
end
end end
end end

View File

@ -13,6 +13,8 @@ module ImportScripts::Mbox
attr_reader :trust_level attr_reader :trust_level
attr_reader :prefer_html attr_reader :prefer_html
attr_reader :staged attr_reader :staged
attr_reader :index_only
attr_reader :group_messages_by_subject
def initialize(yaml) def initialize(yaml)
@data_dir = yaml['data_dir'] @data_dir = yaml['data_dir']
@ -21,6 +23,8 @@ module ImportScripts::Mbox
@trust_level = yaml['default_trust_level'] @trust_level = yaml['default_trust_level']
@prefer_html = yaml['prefer_html'] @prefer_html = yaml['prefer_html']
@staged = yaml['staged'] @staged = yaml['staged']
@index_only = yaml['index_only']
@group_messages_by_subject = yaml['group_messages_by_subject']
end end
end end
end end