Improvements to mbox importer
* store the time it took to index each message in the DB (to find performance issues)
* ignore Listserv-specific files
* better examples for split_regex
* the first email in an mbox file shouldn't contain the split string
* always lock the DB in exclusive mode
* save each email within a transaction
* messages can be grouped by subject and keep their original order (for Listserv)
* add an option to index emails without running the import
This commit is contained in:
parent
5d7a33cd6d
commit
bb54eb1192
|
@ -239,7 +239,7 @@ module Email
|
|||
|
||||
if text.present?
|
||||
text = trim_discourse_markers(text)
|
||||
text, elided_text = EmailReplyTrimmer.trim(text, true)
|
||||
text, elided_text = trim_reply_and_extract_elided(text)
|
||||
|
||||
if @opts[:convert_plaintext] || sent_to_mailinglist_mirror?
|
||||
text_content_type ||= ""
|
||||
|
@ -255,7 +255,7 @@ module Email
|
|||
markdown, elided_markdown = if html.present?
|
||||
markdown = HtmlToMarkdown.new(html, keep_img_tags: true, keep_cid_imgs: true).to_markdown
|
||||
markdown = trim_discourse_markers(markdown)
|
||||
EmailReplyTrimmer.trim(markdown, true)
|
||||
trim_reply_and_extract_elided(markdown)
|
||||
end
|
||||
|
||||
if text.blank? || (SiteSetting.incoming_email_prefer_html && markdown.present?)
|
||||
|
@ -265,6 +265,11 @@ module Email
|
|||
end
|
||||
end
|
||||
|
||||
# Returns a two-element array that callers destructure into the text to keep
# and the elided text.
#
# When the :skip_trimming option is set the text is passed through unchanged
# with an empty elided part; otherwise the work is delegated to
# EmailReplyTrimmer.
#
# @param text [String] body text of the email
# @return [Array] pair of (text, elided text)
def trim_reply_and_extract_elided(text)
  if @opts[:skip_trimming]
    [text, ""]
  else
    EmailReplyTrimmer.trim(text, true)
  end
end
|
||||
|
||||
def fix_charset(mail_part)
|
||||
return nil if mail_part.blank? || mail_part.body.blank?
|
||||
|
||||
|
|
|
@ -31,6 +31,7 @@ class ImportScripts::Base
|
|||
@site_settings_during_import = {}
|
||||
@old_site_settings = {}
|
||||
@start_times = { import: Time.now }
|
||||
@skip_updates = false
|
||||
end
|
||||
|
||||
def preload_i18n
|
||||
|
@ -46,14 +47,16 @@ class ImportScripts::Base
|
|||
|
||||
puts ""
|
||||
|
||||
update_bumped_at
|
||||
update_last_posted_at
|
||||
update_last_seen_at
|
||||
update_user_stats
|
||||
update_feature_topic_users
|
||||
update_category_featured_topics
|
||||
update_topic_count_replies
|
||||
reset_topic_counters
|
||||
unless @skip_updates
|
||||
update_bumped_at
|
||||
update_last_posted_at
|
||||
update_last_seen_at
|
||||
update_user_stats
|
||||
update_feature_topic_users
|
||||
update_category_featured_topics
|
||||
update_topic_count_replies
|
||||
reset_topic_counters
|
||||
end
|
||||
|
||||
elapsed = Time.now - @start_times[:import]
|
||||
puts '', '', 'Done (%02dh %02dmin %02dsec)' % [elapsed / 3600, elapsed / 60 % 60, elapsed % 60]
|
||||
|
|
|
@ -24,9 +24,14 @@ module ImportScripts::Mbox
|
|||
|
||||
def execute
|
||||
index_messages
|
||||
import_categories
|
||||
import_users
|
||||
import_posts
|
||||
|
||||
if @settings.index_only
|
||||
@skip_updates = true
|
||||
else
|
||||
import_categories
|
||||
import_users
|
||||
import_posts
|
||||
end
|
||||
end
|
||||
|
||||
def index_messages
|
||||
|
|
|
@ -1,11 +1,18 @@
|
|||
# PostgreSQL mailing lists
|
||||
#data_dir: /shared/import/data
|
||||
#split_regex: "^From .*@postgresql.org.*"
|
||||
|
||||
# ruby-talk mailing list
|
||||
data_dir: /shared/import/data
|
||||
split_regex: ""
|
||||
|
||||
# mbox files
|
||||
split_regex: "^From .+"
|
||||
#split_regex: "^From .+@example.com.+"
|
||||
|
||||
# individual emails
|
||||
#split_regex: ""
|
||||
|
||||
# Listserv files
|
||||
#split_regex: "^========================================================================="
|
||||
|
||||
default_trust_level: 1
|
||||
prefer_html: false
|
||||
staged: true
|
||||
index_only: false
|
||||
|
||||
group_messages_by_subject: false
|
||||
|
|
|
@ -2,7 +2,7 @@ require 'sqlite3'
|
|||
|
||||
module ImportScripts::Mbox
|
||||
class Database
|
||||
SCHEMA_VERSION = 1
|
||||
SCHEMA_VERSION = 2
|
||||
|
||||
def initialize(directory, batch_size)
|
||||
@db = SQLite3::Database.new("#{directory}/index.db", results_as_hash: true)
|
||||
|
@ -17,6 +17,15 @@ module ImportScripts::Mbox
|
|||
create_table_for_users
|
||||
end
|
||||
|
||||
# Runs the given block inside a database transaction.
#
# This object is yielded to the block. The transaction is committed when the
# block finishes and rolled back when anything raises. Errors are not
# re-raised, so a single failing unit of work does not abort the caller, but
# they are reported on stderr instead of being dropped silently.
#
# @yieldparam db [self] the object the block should issue its statements on
# @return [Boolean] the result of the commit on success, false on failure
def transaction
  @db.transaction
  yield self
  @db.commit
rescue StandardError => e
  # Opening the transaction may itself have failed; rolling back without an
  # active transaction would raise a second, unrelated error from here.
  @db.rollback if @db.transaction_active?
  warn "Transaction rolled back: #{e.class}: #{e.message}"
  false
end
|
||||
|
||||
def insert_category(category)
|
||||
@db.execute(<<-SQL, category)
|
||||
INSERT OR REPLACE INTO category (name, description)
|
||||
|
@ -35,10 +44,10 @@ module ImportScripts::Mbox
|
|||
@db.execute(<<-SQL, email)
|
||||
INSERT OR REPLACE INTO email (msg_id, from_email, from_name, subject,
|
||||
email_date, raw_message, body, elided, format, attachment_count, charset,
|
||||
category, filename, first_line_number, last_line_number)
|
||||
category, filename, first_line_number, last_line_number, index_duration)
|
||||
VALUES (:msg_id, :from_email, :from_name, :subject,
|
||||
:email_date, :raw_message, :body, :elided, :format, :attachment_count, :charset,
|
||||
:category, :filename, :first_line_number, :last_line_number)
|
||||
:category, :filename, :first_line_number, :last_line_number, :index_duration)
|
||||
SQL
|
||||
end
|
||||
|
||||
|
@ -69,7 +78,21 @@ module ImportScripts::Mbox
|
|||
SQL
|
||||
end
|
||||
|
||||
def sort_emails
|
||||
# Rewrites in_reply_to of every row in the email table based on the subject.
#
# For each email the subquery picks the msg_id of the first email (lowest
# email_order ROWID) that has the same subject. NULLIF turns the result into
# NULL when that first email is the email itself, so the first message of
# each subject ends up without a parent.
#
# The subquery joins email_order, so that table has to be populated before
# this method is called.
def update_in_reply_to_by_email_subject
|
||||
@db.execute <<-SQL
|
||||
UPDATE email
|
||||
SET in_reply_to = NULLIF((
|
||||
SELECT e.msg_id
|
||||
FROM email e
|
||||
JOIN email_order o ON (e.msg_id = o.msg_id)
|
||||
WHERE e.subject = email.subject
|
||||
ORDER BY o.ROWID
|
||||
LIMIT 1
|
||||
), msg_id)
|
||||
SQL
|
||||
end
|
||||
|
||||
def sort_emails_by_date_and_reply_level
|
||||
@db.execute 'DELETE FROM email_order'
|
||||
|
||||
@db.execute <<-SQL
|
||||
|
@ -90,6 +113,17 @@ module ImportScripts::Mbox
|
|||
SQL
|
||||
end
|
||||
|
||||
# Rebuilds the email_order table so that emails are ordered by subject.
#
# All existing rows of email_order are deleted first. The msg_id of every
# email is then inserted in the order subject, filename, ROWID, so emails
# sharing a subject are adjacent and ties are broken by filename and then by
# the ROWID of the email table.
def sort_emails_by_subject
|
||||
@db.execute 'DELETE FROM email_order'
|
||||
|
||||
@db.execute <<-SQL
|
||||
INSERT INTO email_order (msg_id)
|
||||
SELECT msg_id
|
||||
FROM email
|
||||
ORDER BY subject, filename, ROWID
|
||||
SQL
|
||||
end
|
||||
|
||||
def fill_users_from_emails
|
||||
@db.execute 'DELETE FROM user'
|
||||
|
||||
|
@ -164,10 +198,17 @@ module ImportScripts::Mbox
|
|||
|
||||
# Applies the connection-level SQLite settings used for the index database.
def configure_database
  # The transaction helper of this class rolls back on errors. SQLite
  # documents ROLLBACK as undefined when journal_mode is OFF, so keep a
  # rollback journal, but hold it in memory to avoid extra disk I/O.
  @db.execute 'PRAGMA journal_mode = MEMORY'

  # Take the file lock once and keep it for the lifetime of the connection.
  @db.execute 'PRAGMA locking_mode = EXCLUSIVE'
end
|
||||
|
||||
# Migrates the index database to the schema described by SCHEMA_VERSION.
#
# The version stored in the database is read from PRAGMA user_version. When
# it is 1, the index_duration column is added to the email table. The stored
# version is then always set to SCHEMA_VERSION.
def upgrade_schema_version
  stored_version = @db.get_first_value("PRAGMA user_version")

  if stored_version == 1
    @db.execute "ALTER TABLE email ADD COLUMN index_duration REAL"
  end

  @db.execute "PRAGMA user_version = #{SCHEMA_VERSION}"
end
|
||||
|
||||
|
@ -211,11 +252,13 @@ module ImportScripts::Mbox
|
|||
filename TEXT NOT NULL,
|
||||
first_line_number INTEGER,
|
||||
last_line_number INTEGER,
|
||||
index_duration REAL,
|
||||
FOREIGN KEY(category) REFERENCES category(name)
|
||||
)
|
||||
SQL
|
||||
|
||||
@db.execute 'CREATE INDEX IF NOT EXISTS email_by_from ON email (from_email)'
|
||||
@db.execute 'CREATE INDEX IF NOT EXISTS email_by_subject ON email (subject)'
|
||||
@db.execute 'CREATE INDEX IF NOT EXISTS email_by_in_reply_to ON email (in_reply_to)'
|
||||
@db.execute 'CREATE INDEX IF NOT EXISTS email_by_date ON email (email_date)'
|
||||
|
||||
|
|
|
@ -8,12 +8,12 @@ module ImportScripts::Mbox
|
|||
# @param settings [ImportScripts::Mbox::Settings]
|
||||
def initialize(database, settings)
|
||||
@database = database
|
||||
@root_directory = settings.data_dir
|
||||
@settings = settings
|
||||
@split_regex = settings.split_regex
|
||||
end
|
||||
|
||||
def execute
|
||||
directories = Dir.glob(File.join(@root_directory, '*'))
|
||||
directories = Dir.glob(File.join(@settings.data_dir, '*'))
|
||||
directories.select! { |f| File.directory?(f) }
|
||||
directories.sort!
|
||||
|
||||
|
@ -24,14 +24,21 @@ module ImportScripts::Mbox
|
|||
end
|
||||
|
||||
puts '', 'indexing replies and users'
|
||||
@database.update_in_reply_to_of_emails
|
||||
@database.sort_emails
|
||||
if @settings.group_messages_by_subject
|
||||
@database.sort_emails_by_subject
|
||||
@database.update_in_reply_to_by_email_subject
|
||||
else
|
||||
@database.update_in_reply_to_of_emails
|
||||
@database.sort_emails_by_date_and_reply_level
|
||||
end
|
||||
|
||||
@database.fill_users_from_emails
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
METADATA_FILENAME = 'metadata.yml'.freeze
|
||||
IGNORED_FILE_EXTENSIONS = ['.dbindex', '.dbnames', '.digest', '.subjects']
|
||||
|
||||
def index_category(directory)
|
||||
metadata_file = File.join(directory, METADATA_FILENAME)
|
||||
|
@ -54,7 +61,7 @@ module ImportScripts::Mbox
|
|||
end
|
||||
|
||||
def index_emails(directory, category_name)
|
||||
all_messages(directory, category_name) do |receiver, filename, first_line_number, last_line_number|
|
||||
all_messages(directory, category_name) do |receiver, filename, opts|
|
||||
msg_id = receiver.message_id
|
||||
parsed_email = receiver.mail
|
||||
from_email, from_display_name = receiver.parse_from_field(parsed_email)
|
||||
|
@ -75,12 +82,15 @@ module ImportScripts::Mbox
|
|||
charset: parsed_email.charset&.downcase,
|
||||
category: category_name,
|
||||
filename: File.basename(filename),
|
||||
first_line_number: first_line_number,
|
||||
last_line_number: last_line_number
|
||||
first_line_number: opts[:first_line_number],
|
||||
last_line_number: opts[:last_line_number],
|
||||
index_duration: (monotonic_time - opts[:start_time]).round(4)
|
||||
}
|
||||
|
||||
@database.insert_email(email)
|
||||
@database.insert_replies(msg_id, reply_message_ids) unless reply_message_ids.empty?
|
||||
@database.transaction do |db|
|
||||
db.insert_email(email)
|
||||
db.insert_replies(msg_id, reply_message_ids) unless reply_message_ids.empty?
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -102,12 +112,18 @@ module ImportScripts::Mbox
|
|||
|
||||
if @split_regex.present?
|
||||
each_mail(filename) do |raw_message, first_line_number, last_line_number|
|
||||
opts = {
|
||||
first_line_number: first_line_number,
|
||||
last_line_number: last_line_number,
|
||||
start_time: monotonic_time
|
||||
}
|
||||
receiver = read_mail_from_string(raw_message)
|
||||
yield receiver, filename, first_line_number, last_line_number if receiver.present?
|
||||
yield receiver, filename, opts if receiver.present?
|
||||
end
|
||||
else
|
||||
opts = { start_time: monotonic_time }
|
||||
receiver = read_mail_from_file(filename)
|
||||
yield receiver, filename if receiver.present?
|
||||
yield receiver, filename, opts if receiver.present?
|
||||
end
|
||||
|
||||
mark_as_fully_indexed(category_name, filename)
|
||||
|
@ -132,10 +148,12 @@ module ImportScripts::Mbox
|
|||
each_line(filename) do |line|
|
||||
line = line.scrub
|
||||
|
||||
if line =~ @split_regex && last_line_number.positive?
|
||||
yield raw_message, first_line_number, last_line_number
|
||||
raw_message = ''
|
||||
first_line_number = last_line_number + 1
|
||||
if line =~ @split_regex
|
||||
if last_line_number > 0
|
||||
yield raw_message, first_line_number, last_line_number
|
||||
raw_message = ''
|
||||
first_line_number = last_line_number + 1
|
||||
end
|
||||
else
|
||||
raw_message << line
|
||||
end
|
||||
|
@ -163,7 +181,7 @@ module ImportScripts::Mbox
|
|||
end
|
||||
|
||||
def read_mail_from_string(raw_message)
|
||||
Email::Receiver.new(raw_message, convert_plaintext: true) unless raw_message.blank?
|
||||
Email::Receiver.new(raw_message, convert_plaintext: true, skip_trimming: false) unless raw_message.blank?
|
||||
end
|
||||
|
||||
def extract_reply_message_ids(mail)
|
||||
|
@ -210,16 +228,12 @@ module ImportScripts::Mbox
|
|||
end
|
||||
|
||||
def ignored_file?(filename, checksums)
|
||||
File.directory?(filename) || hidden_file?(filename) ||
|
||||
metadata_file?(filename) || fully_indexed?(filename, checksums)
|
||||
end
|
||||
filename = File.basename(filename)
|
||||
|
||||
def hidden_file?(filename)
|
||||
File.basename(filename).start_with?('.')
|
||||
end
|
||||
|
||||
def metadata_file?(filename)
|
||||
File.basename(filename) == METADATA_FILENAME
|
||||
filename.start_with?('.') ||
|
||||
filename == METADATA_FILENAME ||
|
||||
IGNORED_FILE_EXTENSIONS.include?(File.extname(filename)) ||
|
||||
fully_indexed?(filename, checksums)
|
||||
end
|
||||
|
||||
def fully_indexed?(filename, checksums)
|
||||
|
@ -230,5 +244,9 @@ module ImportScripts::Mbox
|
|||
def calc_checksum(filename)
|
||||
Digest::SHA256.file(filename).hexdigest
|
||||
end
|
||||
|
||||
# Reads the monotonic clock, which is unaffected by wall-clock adjustments
# and therefore suitable for measuring elapsed time.
#
# @return [Float] clock reading in seconds
def monotonic_time
  Process.clock_gettime(Process::CLOCK_MONOTONIC, :float_second)
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -13,6 +13,8 @@ module ImportScripts::Mbox
|
|||
attr_reader :trust_level
|
||||
attr_reader :prefer_html
|
||||
attr_reader :staged
|
||||
attr_reader :index_only
|
||||
attr_reader :group_messages_by_subject
|
||||
|
||||
def initialize(yaml)
|
||||
@data_dir = yaml['data_dir']
|
||||
|
@ -21,6 +23,8 @@ module ImportScripts::Mbox
|
|||
@trust_level = yaml['default_trust_level']
|
||||
@prefer_html = yaml['prefer_html']
|
||||
@staged = yaml['staged']
|
||||
@index_only = yaml['index_only']
|
||||
@group_messages_by_subject = yaml['group_messages_by_subject']
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue