DEV: Improve mbox import script

* Customizable email subject prefixes to remove "Re" and "Fwd" as well as localized prefixes.
* Configuration option for prefixes like [FOO] or (BAR) which can be replaced with tags during import.
* Bugfix: Import script might have skipped some users due to missing ORDER BY.
This commit is contained in:
Gerhard Schlager 2020-03-08 01:20:02 +01:00
parent edc8d58ac3
commit 0e752db411
5 changed files with 76 additions and 32 deletions

View File

@ -30,6 +30,8 @@ module ImportScripts::Mbox
if @settings.index_only if @settings.index_only
@skip_updates = true @skip_updates = true
else else
SiteSetting.tagging_enabled = true if @settings.tags.present?
import_categories import_categories
import_users import_users
import_posts import_posts
@ -142,9 +144,13 @@ module ImportScripts::Mbox
end end
def map_first_post(row) def map_first_post(row)
subject = row['subject']
tags = remove_tags!(subject)
mapped = map_post(row) mapped = map_post(row)
mapped[:category] = category_id_from_imported_category_id(row['category']) mapped[:category] = category_id_from_imported_category_id(row['category'])
mapped[:title] = row['subject'].strip[0...255] mapped[:title] = subject.strip[0...255]
mapped[:tags] = tags if tags.present?
mapped mapped
end end
@ -161,6 +167,37 @@ module ImportScripts::Mbox
mapped mapped
end end
# Strips leading subject prefixes and configured tag prefixes from +subject+
# (modifying the string in place) and collects the names of the matched tags.
#
# Each entry of @settings.tags is read as a hash with a :regex and a :name.
# An entry whose name is blank still has its prefix removed, but no tag name
# is recorded for it.
#
# @param subject [String] the subject line; mutated in place
# @return [Array<String>] unique tag names in the order they were first matched
def remove_tags!(subject)
  collected = []
  remove_prefixes!(subject)

  loop do
    length_before = subject.length

    @settings.tags.each do |tag|
      next unless subject.sub!(tag[:regex], "")
      collected << tag[:name] if tag[:name].present?
    end

    # Nothing was stripped during this pass, so there is nothing left to do.
    break if subject.length == length_before

    # Removing a tag may have exposed another "Re: "-style prefix.
    remove_prefixes!(subject)
  end

  collected.uniq
end
# Removes every leading prefix matched by @settings.subject_prefix_regex
# from +subject+, modifying the string in place. Surrounding whitespace is
# stripped after each removal.
#
# @param subject [String] the subject line; mutated in place
# @return [nil]
def remove_prefixes!(subject)
  prefix_regex = @settings.subject_prefix_regex

  # The settings leave the regex unset when the list of prefixes is empty;
  # String#sub! would raise a TypeError for a nil pattern.
  return if prefix_regex.nil?

  # There could be multiple prefixes...
  loop do
    length_before = subject.length
    break unless subject.sub!(prefix_regex, "")

    subject.strip!

    # A user-supplied pattern that can match the empty string makes sub!
    # succeed without removing anything; stop instead of looping forever.
    break if subject.length == length_before
  end
end
def create_incoming_email(post, row) def create_incoming_email(post, row)
IncomingEmail.create( IncomingEmail.create(
message_id: row['msg_id'], message_id: row['msg_id'],

View File

@ -16,3 +16,20 @@ staged: true
index_only: false index_only: false
group_messages_by_subject: false group_messages_by_subject: false
# Remove prefixes like [FOO] or (BAR) from topic titles and replace them with tags.
# You can map one or more case-insensitive prefixes to the same tag in Discourse.
# "Tag name in Discourse": "foo"
# "Tag name in Discourse": ["foo", "bar"]
# You can use an empty tag name to remove prefixes without creating a tag in Discourse.
tags:
# "off-topic": "ot"
# announcement: ["ann", "announce", "announcement"]
# "": ["foo", "bar"]
# These prefixes will be removed from the beginning of topic titles. You can use regular expressions.
# Prefixes are case-insensitive. You can add additional prefixes (e.g. localized prefixes from
# https://en.wikipedia.org/wiki/List_of_email_subject_abbreviations#Abbreviations_in_other_languages).
remove_subject_prefixes:
- "Re: "
- "Fwd?: "

View File

@ -168,6 +168,7 @@ module ImportScripts::Mbox
SELECT email, name, date_of_first_message SELECT email, name, date_of_first_message
FROM user FROM user
WHERE email > :last_email WHERE email > :last_email
ORDER BY email
LIMIT #{@batch_size} LIMIT #{@batch_size}
SQL SQL

View File

@ -204,37 +204,7 @@ module ImportScripts::Mbox
def extract_subject(receiver, list_name) def extract_subject(receiver, list_name)
subject = receiver.subject subject = receiver.subject
return nil if subject.blank? subject.blank? ? nil : subject.strip
# TODO: make the list name (or maybe multiple names) configurable
# Strip mailing list name from subject
subject = subject.gsub(/\[#{Regexp.escape(list_name)}\]/i, '').strip
clean_subject(subject)
end
# TODO: refactor and move prefixes to settings
def clean_subject(subject)
original_length = subject.length
# Strip Reply prefix from title (Standard and localized)
subject = subject.gsub(/^Re: */i, '')
subject = subject.gsub(/^R: */i, '') #Italian
subject = subject.gsub(/^RIF: */i, '') #Italian
# Strip Forward prefix from title (Standard and localized)
subject = subject.gsub(/^Fwd: */i, '')
subject = subject.gsub(/^I: */i, '') #Italian
subject.strip
# In case of mixed localized prefixes there could be many of them
# if the mail client didn't strip the localized ones
if original_length > subject.length
clean_subject(subject)
else
subject
end
end end
def ignored_file?(path, checksums) def ignored_file?(path, checksums)

View File

@ -17,6 +17,9 @@ module ImportScripts::Mbox
attr_reader :staged attr_reader :staged
attr_reader :index_only attr_reader :index_only
attr_reader :group_messages_by_subject attr_reader :group_messages_by_subject
attr_reader :subject_prefix_regex
attr_reader :automatically_remove_list_name_prefix
attr_reader :tags
def initialize(yaml) def initialize(yaml)
@data_dir = yaml['data_dir'] @data_dir = yaml['data_dir']
@ -27,6 +30,22 @@ module ImportScripts::Mbox
@staged = yaml['staged'] @staged = yaml['staged']
@index_only = yaml['index_only'] @index_only = yaml['index_only']
@group_messages_by_subject = yaml['group_messages_by_subject'] @group_messages_by_subject = yaml['group_messages_by_subject']
unless yaml['remove_subject_prefixes'].empty?
prefix_regexes = yaml['remove_subject_prefixes'].map { |p| Regexp.new(p) }
@subject_prefix_regex = /^#{Regexp.union(prefix_regexes).source}/i
end
@automatically_remove_list_name_prefix = yaml['automatically_remove_list_name_prefix']
@tags = []
yaml['tags'].each do |tag_name, value|
prefixes = Regexp.union(value).source
@tags << {
regex: /^(?:(?:\[(?:#{prefixes})\])|(?:\((?:#{prefixes})\)))\s*/i,
name: tag_name
}
end
end end
end end
end end