From 0e752db41106820554f1ce5c8b2a27762e161485 Mon Sep 17 00:00:00 2001
From: Gerhard Schlager
Date: Sun, 8 Mar 2020 01:20:02 +0100
Subject: [PATCH] DEV: Improve mbox import script

* Customizable email subject prefixes to remove "Re" and "Fwd" as well as localized prefixes.
* Configuration option for prefixes like [FOO] or (BAR) which can be replaced with tags during import.
* Bugfix: Import script might have skipped some users due to missing ORDER BY.
---
 script/import_scripts/mbox/importer.rb        | 48 ++++++++++++++++++-
 script/import_scripts/mbox/settings.yml       | 17 ++++++++
 .../import_scripts/mbox/support/database.rb   |  1 +
 script/import_scripts/mbox/support/indexer.rb | 32 +-------
 .../import_scripts/mbox/support/settings.rb   | 19 +++++++++
 5 files changed, 85 insertions(+), 32 deletions(-)

diff --git a/script/import_scripts/mbox/importer.rb b/script/import_scripts/mbox/importer.rb
index 3d823e197df..d4f8ec47945 100644
--- a/script/import_scripts/mbox/importer.rb
+++ b/script/import_scripts/mbox/importer.rb
@@ -30,6 +30,8 @@ module ImportScripts::Mbox
       if @settings.index_only
         @skip_updates = true
       else
+        SiteSetting.tagging_enabled = true if @settings.tags.present?
+
         import_categories
         import_users
         import_posts
@@ -142,9 +144,13 @@ module ImportScripts::Mbox
     end
 
     def map_first_post(row)
+      subject = row['subject']
+      tags = remove_tags!(subject)
+
       mapped = map_post(row)
       mapped[:category] = category_id_from_imported_category_id(row['category'])
-      mapped[:title] = row['subject'].strip[0...255]
+      mapped[:title] = subject.strip[0...255]
+      mapped[:tags] = tags if tags.present?
       mapped
     end
 
@@ -161,6 +167,46 @@ module ImportScripts::Mbox
       mapped
     end
 
+    # Removes subject prefixes and tag prefixes like [FOO] or (BAR) from the beginning
+    # of `subject` (modified in place). Returns the unique, non-blank names of the matched tags.
+    def remove_tags!(subject)
+      tag_names = []
+      remove_prefixes!(subject)
+
+      loop do
+        old_length = subject.length
+
+        @settings.tags.each do |tag|
+          if subject.sub!(tag[:regex], "") && tag[:name].present?
+            tag_names << tag[:name]
+          end
+        end
+
+        remove_prefixes!(subject) if subject.length != old_length
+        break if subject.length == old_length
+      end
+
+      tag_names.uniq
+    end
+
+    # Removes all configured prefixes (e.g. "Re: ") from the beginning of `subject`
+    # (modified in place).
+    def remove_prefixes!(subject)
+      prefix_regex = @settings.subject_prefix_regex
+      # The regex is nil when no prefixes are configured and String#sub! raises a TypeError on nil.
+      return if prefix_regex.nil?
+
+      # There could be multiple prefixes...
+      loop do
+        old_length = subject.length
+        break unless subject.sub!(prefix_regex, "")
+
+        subject.strip!
+        # Nothing was removed (the pattern matched the empty string), so looping again would never end.
+        break if subject.length == old_length
+      end
+    end
+
     def create_incoming_email(post, row)
       IncomingEmail.create(
         message_id: row['msg_id'],
diff --git a/script/import_scripts/mbox/settings.yml b/script/import_scripts/mbox/settings.yml
index cac3996a90a..f1188001002 100644
--- a/script/import_scripts/mbox/settings.yml
+++ b/script/import_scripts/mbox/settings.yml
@@ -16,3 +16,20 @@ staged: true
 
 index_only: false
 group_messages_by_subject: false
+
+# Remove prefixes like [FOO] or (BAR) from topic titles and replace them with tags.
+# You can map one or more case-insensitive prefixes to the same tag in Discourse.
+# "Tag name in Discourse": "foo"
+# "Tag name in Discourse": ["foo", "bar"]
+# You can use an empty tag name to remove prefixes without creating a tag in Discourse.
+tags:
+# "off-topic": "ot"
+# announcement: ["ann", "announce", "announcement"]
+# "": ["foo", "bar"]
+
+# These prefixes will be removed from the beginning of topic titles. You can use regular expressions.
+# Prefixes are case-insensitive. You can add additional prefixes (e.g. localized prefixes from
+# https://en.wikipedia.org/wiki/List_of_email_subject_abbreviations#Abbreviations_in_other_languages).
+remove_subject_prefixes:
+  - "Re: "
+  - "Fwd?: "
\ No newline at end of file
diff --git a/script/import_scripts/mbox/support/database.rb b/script/import_scripts/mbox/support/database.rb
index ea1a6600d5b..052f0d479a9 100644
--- a/script/import_scripts/mbox/support/database.rb
+++ b/script/import_scripts/mbox/support/database.rb
@@ -168,6 +168,7 @@ module ImportScripts::Mbox
         SELECT email, name, date_of_first_message
         FROM user
         WHERE email > :last_email
+        ORDER BY email
         LIMIT #{@batch_size}
       SQL
 
diff --git a/script/import_scripts/mbox/support/indexer.rb b/script/import_scripts/mbox/support/indexer.rb
index fa51ef62c42..7017c2a59fd 100644
--- a/script/import_scripts/mbox/support/indexer.rb
+++ b/script/import_scripts/mbox/support/indexer.rb
@@ -204,37 +204,7 @@ module ImportScripts::Mbox
 
     def extract_subject(receiver, list_name)
       subject = receiver.subject
-      return nil if subject.blank?
-
-      # TODO: make the list name (or maybe multiple names) configurable
-      # Strip mailing list name from subject
-      subject = subject.gsub(/\[#{Regexp.escape(list_name)}\]/i, '').strip
-
-      clean_subject(subject)
-    end
-
-    # TODO: refactor and move prefixes to settings
-    def clean_subject(subject)
-      original_length = subject.length
-
-      # Strip Reply prefix from title (Standard and localized)
-      subject = subject.gsub(/^Re: */i, '')
-      subject = subject.gsub(/^R: */i, '') #Italian
-      subject = subject.gsub(/^RIF: */i, '') #Italian
-
-      # Strip Forward prefix from title (Standard and localized)
-      subject = subject.gsub(/^Fwd: */i, '')
-      subject = subject.gsub(/^I: */i, '') #Italian
-
-      subject.strip
-
-      # In case of mixed localized prefixes there could be many of them
-      # if the mail client didn't strip the localized ones
-      if original_length > subject.length
-        clean_subject(subject)
-      else
-        subject
-      end
+      subject.blank? ? nil : subject.strip
     end
 
     def ignored_file?(path, checksums)
diff --git a/script/import_scripts/mbox/support/settings.rb b/script/import_scripts/mbox/support/settings.rb
index be8bd68e4cc..f30c5d56d18 100644
--- a/script/import_scripts/mbox/support/settings.rb
+++ b/script/import_scripts/mbox/support/settings.rb
@@ -17,6 +17,9 @@ module ImportScripts::Mbox
     attr_reader :staged
     attr_reader :index_only
     attr_reader :group_messages_by_subject
+    attr_reader :subject_prefix_regex
+    attr_reader :automatically_remove_list_name_prefix
+    attr_reader :tags
 
     def initialize(yaml)
       @data_dir = yaml['data_dir']
@@ -27,6 +30,22 @@ module ImportScripts::Mbox
       @staged = yaml['staged']
       @index_only = yaml['index_only']
       @group_messages_by_subject = yaml['group_messages_by_subject']
+
+      # The key may be missing (older settings.yml) or empty. Every prefix is compiled case-insensitively
+      # and the union itself is interpolated, so the "^" anchor applies to all alternatives.
+      prefix_patterns = Array(yaml['remove_subject_prefixes'])
+      prefix_regexes = prefix_patterns.map { |p| Regexp.new(p, Regexp::IGNORECASE) }
+      @subject_prefix_regex = /^#{Regexp.union(prefix_regexes)}/ if prefix_regexes.any?
+
+      @automatically_remove_list_name_prefix = yaml['automatically_remove_list_name_prefix']
+
+      # "tags" is nil when all mappings in settings.yml are commented out (the default).
+      # A mapping value can be a single prefix or a list of prefixes.
+      @tags = (yaml['tags'] || {}).map do |tag_name, value|
+        prefixes = Regexp.union(Array(value).map(&:to_s)).source
+        # Matches a leading "[prefix]" or "(prefix)" and the whitespace that follows it.
+        { regex: /^(?:(?:\[(?:#{prefixes})\])|(?:\((?:#{prefixes})\)))\s*/i, name: tag_name }
+      end
     end
   end
 end