diff --git a/lib/email/receiver.rb b/lib/email/receiver.rb
index e143c82b753..57061de3faa 100644
--- a/lib/email/receiver.rb
+++ b/lib/email/receiver.rb
@@ -239,7 +239,7 @@ module Email
 
       if text.present?
         text = trim_discourse_markers(text)
-        text, elided_text = EmailReplyTrimmer.trim(text, true)
+        text, elided_text = trim_reply_and_extract_elided(text)
 
         if @opts[:convert_plaintext] || sent_to_mailinglist_mirror?
           text_content_type ||= ""
@@ -255,7 +255,7 @@ module Email
       markdown, elided_markdown = if html.present?
         markdown = HtmlToMarkdown.new(html, keep_img_tags: true, keep_cid_imgs: true).to_markdown
         markdown = trim_discourse_markers(markdown)
-        EmailReplyTrimmer.trim(markdown, true)
+        trim_reply_and_extract_elided(markdown)
       end
 
       if text.blank? || (SiteSetting.incoming_email_prefer_html && markdown.present?)
@@ -265,6 +265,16 @@ module Email
       end
     end
 
+    # Returns a [text, elided_text] pair. When the :skip_trimming option is
+    # set, the text is handed back untouched with an empty elided part.
+    def trim_reply_and_extract_elided(text)
+      if @opts[:skip_trimming]
+        [text, ""]
+      else
+        EmailReplyTrimmer.trim(text, true)
+      end
+    end
+
     def fix_charset(mail_part)
       return nil if mail_part.blank? || mail_part.body.blank?
 
diff --git a/script/import_scripts/base.rb b/script/import_scripts/base.rb
index 4d177156b40..7f683818f7a 100644
--- a/script/import_scripts/base.rb
+++ b/script/import_scripts/base.rb
@@ -31,6 +31,7 @@ class ImportScripts::Base
     @site_settings_during_import = {}
     @old_site_settings = {}
     @start_times = { import: Time.now }
+    @skip_updates = false
   end
 
   def preload_i18n
@@ -46,14 +47,16 @@ class ImportScripts::Base
 
     puts ""
 
-    update_bumped_at
-    update_last_posted_at
-    update_last_seen_at
-    update_user_stats
-    update_feature_topic_users
-    update_category_featured_topics
-    update_topic_count_replies
-    reset_topic_counters
+    unless @skip_updates
+      update_bumped_at
+      update_last_posted_at
+      update_last_seen_at
+      update_user_stats
+      update_feature_topic_users
+      update_category_featured_topics
+      update_topic_count_replies
+      reset_topic_counters
+    end
 
     elapsed = Time.now - @start_times[:import]
     puts '', '', 'Done (%02dh %02dmin %02dsec)' % [elapsed / 3600, elapsed / 60 % 60, elapsed % 60]
diff --git a/script/import_scripts/mbox/importer.rb b/script/import_scripts/mbox/importer.rb
index d894b24f9eb..3886f33bb69 100644
--- a/script/import_scripts/mbox/importer.rb
+++ b/script/import_scripts/mbox/importer.rb
@@ -24,9 +24,14 @@ module ImportScripts::Mbox
 
     def execute
       index_messages
-      import_categories
-      import_users
-      import_posts
+
+      if @settings.index_only
+        @skip_updates = true
+      else
+        import_categories
+        import_users
+        import_posts
+      end
     end
 
     def index_messages
diff --git a/script/import_scripts/mbox/settings.yml b/script/import_scripts/mbox/settings.yml
index 942ee708d4e..a4b449a3d6f 100644
--- a/script/import_scripts/mbox/settings.yml
+++ b/script/import_scripts/mbox/settings.yml
@@ -1,11 +1,18 @@
-# PostgreSQL mailing lists
-#data_dir: /shared/import/data
-#split_regex: "^From .*@postgresql.org.*"
-
-# ruby-talk mailing list
 data_dir: /shared/import/data
-split_regex: ""
+
+# mbox files
+split_regex: "^From .+"
+#split_regex: "^From .+@example.com.+"
+
+# individual emails
+#split_regex: ""
+
+# Listserv files
+#split_regex: "^========================================================================="
 
 default_trust_level: 1
 prefer_html: false
 staged: true
+index_only: false
+
+group_messages_by_subject: false
diff --git a/script/import_scripts/mbox/support/database.rb b/script/import_scripts/mbox/support/database.rb
index 2a20960c25b..085dd2f9139 100644
--- a/script/import_scripts/mbox/support/database.rb
+++ b/script/import_scripts/mbox/support/database.rb
@@ -2,7 +2,7 @@ require 'sqlite3'
 
 module ImportScripts::Mbox
   class Database
-    SCHEMA_VERSION = 1
+    SCHEMA_VERSION = 2
 
     def initialize(directory, batch_size)
       @db = SQLite3::Database.new("#{directory}/index.db", results_as_hash: true)
@@ -17,6 +17,20 @@ module ImportScripts::Mbox
       create_table_for_users
     end
 
+    # Runs the block inside a transaction, yielding this Database, and commits.
+    # If anything raises, the transaction is rolled back and a warning is
+    # printed instead of failing silently; the error is not re-raised.
+    # NOTE(review): configure_database sets journal_mode = OFF, for which SQLite
+    # documents ROLLBACK as undefined -- confirm the rollback here is reliable.
+    def transaction
+      @db.transaction
+      yield self
+      @db.commit
+    rescue => e
+      @db.rollback
+      warn "Transaction rolled back: #{e.class}: #{e.message}"
+    end
+
     def insert_category(category)
       @db.execute(<<-SQL, category)
         INSERT OR REPLACE INTO category (name, description)
@@ -35,10 +49,10 @@ module ImportScripts::Mbox
       @db.execute(<<-SQL, email)
         INSERT OR REPLACE INTO email (msg_id, from_email, from_name, subject,
           email_date, raw_message, body, elided, format, attachment_count, charset,
-          category, filename, first_line_number, last_line_number)
+          category, filename, first_line_number, last_line_number, index_duration)
         VALUES (:msg_id, :from_email, :from_name, :subject,
           :email_date, :raw_message, :body, :elided, :format, :attachment_count, :charset,
-          :category, :filename, :first_line_number, :last_line_number)
+          :category, :filename, :first_line_number, :last_line_number, :index_duration)
       SQL
     end
 
@@ -69,7 +83,23 @@ module ImportScripts::Mbox
       SQL
     end
 
-    def sort_emails
+    # Points in_reply_to of every email at the first email (by email_order ROWID)
+    # that shares its subject; that first email itself gets NULL.
+    def update_in_reply_to_by_email_subject
+      @db.execute <<-SQL
+        UPDATE email
+        SET in_reply_to = NULLIF((
+          SELECT e.msg_id
+          FROM email e
+            JOIN email_order o ON (e.msg_id = o.msg_id)
+          WHERE e.subject = email.subject
+          ORDER BY o.ROWID
+          LIMIT 1
+        ), msg_id)
+      SQL
+    end
+
+    def sort_emails_by_date_and_reply_level
       @db.execute 'DELETE FROM email_order'
 
       @db.execute <<-SQL
@@ -90,6 +120,18 @@ module ImportScripts::Mbox
       SQL
     end
 
+    # Rebuilds email_order with emails sorted by subject, filename and ROWID.
+    def sort_emails_by_subject
+      @db.execute 'DELETE FROM email_order'
+
+      @db.execute <<-SQL
+        INSERT INTO email_order (msg_id)
+        SELECT msg_id
+        FROM email
+        ORDER BY subject, filename, ROWID
+      SQL
+    end
+
     def fill_users_from_emails
       @db.execute 'DELETE FROM user'
 
@@ -164,10 +206,17 @@ module ImportScripts::Mbox
 
     def configure_database
       @db.execute 'PRAGMA journal_mode = OFF'
+      @db.execute 'PRAGMA locking_mode = EXCLUSIVE'
     end
 
     def upgrade_schema_version
-      # current_version = query("PRAGMA user_version").last[0]
+      current_version = @db.get_first_value("PRAGMA user_version")
+
+      case current_version
+      when 1
+        @db.execute "ALTER TABLE email ADD COLUMN index_duration REAL"
+      end
+
       @db.execute "PRAGMA user_version = #{SCHEMA_VERSION}"
     end
 
@@ -211,11 +260,13 @@ module ImportScripts::Mbox
           filename TEXT NOT NULL,
           first_line_number INTEGER,
           last_line_number INTEGER,
+          index_duration REAL,
           FOREIGN KEY(category) REFERENCES category(name)
         )
       SQL
 
       @db.execute 'CREATE INDEX IF NOT EXISTS email_by_from ON email (from_email)'
+      @db.execute 'CREATE INDEX IF NOT EXISTS email_by_subject ON email (subject)'
       @db.execute 'CREATE INDEX IF NOT EXISTS email_by_in_reply_to ON email (in_reply_to)'
       @db.execute 'CREATE INDEX IF NOT EXISTS email_by_date ON email (email_date)'
 
diff --git a/script/import_scripts/mbox/support/indexer.rb b/script/import_scripts/mbox/support/indexer.rb
index 251975f07da..e50f74449e7 100644
--- a/script/import_scripts/mbox/support/indexer.rb
+++ b/script/import_scripts/mbox/support/indexer.rb
@@ -8,12 +8,12 @@ module ImportScripts::Mbox
     # @param settings [ImportScripts::Mbox::Settings]
     def initialize(database, settings)
       @database = database
-      @root_directory = settings.data_dir
+      @settings = settings
       @split_regex = settings.split_regex
     end
 
     def execute
-      directories = Dir.glob(File.join(@root_directory, '*'))
+      directories = Dir.glob(File.join(@settings.data_dir, '*'))
       directories.select! { |f| File.directory?(f) }
       directories.sort!
 
@@ -24,14 +24,21 @@ module ImportScripts::Mbox
       end
 
       puts '', 'indexing replies and users'
-      @database.update_in_reply_to_of_emails
-      @database.sort_emails
+      if @settings.group_messages_by_subject
+        @database.sort_emails_by_subject
+        @database.update_in_reply_to_by_email_subject
+      else
+        @database.update_in_reply_to_of_emails
+        @database.sort_emails_by_date_and_reply_level
+      end
+
       @database.fill_users_from_emails
     end
 
     private
 
     METADATA_FILENAME = 'metadata.yml'.freeze
+    IGNORED_FILE_EXTENSIONS = ['.dbindex', '.dbnames', '.digest', '.subjects'].freeze
 
     def index_category(directory)
       metadata_file = File.join(directory, METADATA_FILENAME)
@@ -54,7 +61,7 @@ module ImportScripts::Mbox
     end
 
     def index_emails(directory, category_name)
-      all_messages(directory, category_name) do |receiver, filename, first_line_number, last_line_number|
+      all_messages(directory, category_name) do |receiver, filename, opts|
         msg_id = receiver.message_id
         parsed_email = receiver.mail
         from_email, from_display_name = receiver.parse_from_field(parsed_email)
@@ -75,12 +82,15 @@ module ImportScripts::Mbox
           charset: parsed_email.charset&.downcase,
           category: category_name,
           filename: File.basename(filename),
-          first_line_number: first_line_number,
-          last_line_number: last_line_number
+          first_line_number: opts[:first_line_number],
+          last_line_number: opts[:last_line_number],
+          index_duration: (monotonic_time - opts[:start_time]).round(4)
         }
 
-        @database.insert_email(email)
-        @database.insert_replies(msg_id, reply_message_ids) unless reply_message_ids.empty?
+        @database.transaction do |db|
+          db.insert_email(email)
+          db.insert_replies(msg_id, reply_message_ids) unless reply_message_ids.empty?
+        end
       end
     end
 
@@ -102,12 +112,18 @@ module ImportScripts::Mbox
 
         if @split_regex.present?
           each_mail(filename) do |raw_message, first_line_number, last_line_number|
+            opts = {
+              first_line_number: first_line_number,
+              last_line_number: last_line_number,
+              start_time: monotonic_time
+            }
             receiver = read_mail_from_string(raw_message)
-            yield receiver, filename, first_line_number, last_line_number if receiver.present?
+            yield receiver, filename, opts if receiver.present?
           end
         else
+          opts = { start_time: monotonic_time }
           receiver = read_mail_from_file(filename)
-          yield receiver, filename if receiver.present?
+          yield receiver, filename, opts if receiver.present?
         end
 
         mark_as_fully_indexed(category_name, filename)
@@ -132,10 +148,12 @@ module ImportScripts::Mbox
       each_line(filename) do |line|
         line = line.scrub
 
-        if line =~ @split_regex && last_line_number.positive?
-          yield raw_message, first_line_number, last_line_number
-          raw_message = ''
-          first_line_number = last_line_number + 1
+        if line =~ @split_regex
+          if last_line_number > 0
+            yield raw_message, first_line_number, last_line_number
+            raw_message = ''
+            first_line_number = last_line_number + 1
+          end
         else
           raw_message << line
         end
@@ -163,7 +181,7 @@ module ImportScripts::Mbox
     end
 
     def read_mail_from_string(raw_message)
-      Email::Receiver.new(raw_message, convert_plaintext: true) unless raw_message.blank?
+      Email::Receiver.new(raw_message, convert_plaintext: true, skip_trimming: false) unless raw_message.blank?
     end
 
     def extract_reply_message_ids(mail)
@@ -210,16 +228,14 @@ module ImportScripts::Mbox
     end
 
     def ignored_file?(filename, checksums)
-      File.directory?(filename) || hidden_file?(filename) ||
-        metadata_file?(filename) || fully_indexed?(filename, checksums)
-    end
+      # Name-based rules look at the basename only; the directory test and the
+      # checksum lookup keep receiving the filename exactly as it was passed in.
+      basename = File.basename(filename)
 
-    def hidden_file?(filename)
-      File.basename(filename).start_with?('.')
-    end
-
-    def metadata_file?(filename)
-      File.basename(filename) == METADATA_FILENAME
+      File.directory?(filename) || basename.start_with?('.') ||
+        basename == METADATA_FILENAME ||
+        IGNORED_FILE_EXTENSIONS.include?(File.extname(basename)) ||
+        fully_indexed?(filename, checksums)
     end
 
     def fully_indexed?(filename, checksums)
@@ -230,5 +246,10 @@ module ImportScripts::Mbox
     def calc_checksum(filename)
       Digest::SHA256.file(filename).hexdigest
     end
+
+    # Current reading of the monotonic clock; used to time how long indexing takes.
+    def monotonic_time
+      Process.clock_gettime(Process::CLOCK_MONOTONIC)
+    end
   end
 end
diff --git a/script/import_scripts/mbox/support/settings.rb b/script/import_scripts/mbox/support/settings.rb
index a1fc74d3d1e..703ec2220ad 100644
--- a/script/import_scripts/mbox/support/settings.rb
+++ b/script/import_scripts/mbox/support/settings.rb
@@ -13,6 +13,8 @@ module ImportScripts::Mbox
     attr_reader :trust_level
     attr_reader :prefer_html
     attr_reader :staged
+    attr_reader :index_only
+    attr_reader :group_messages_by_subject
 
     def initialize(yaml)
       @data_dir = yaml['data_dir']
@@ -21,6 +23,8 @@ module ImportScripts::Mbox
       @trust_level = yaml['default_trust_level']
       @prefer_html = yaml['prefer_html']
       @staged = yaml['staged']
+      @index_only = yaml['index_only']
+      @group_messages_by_subject = yaml['group_messages_by_subject']
     end
   end
 end