DEV: Improve mbox import script
* Customizable email subject prefixes to remove "Re" and "Fwd" as well as localized prefixes. * Configuration option for prefixes like [FOO] or (BAR) which can be replaced with tags during import. * Bugfix: Import script might have skipped some users due to missing ORDER BY.
This commit is contained in:
parent
edc8d58ac3
commit
0e752db411
|
@ -30,6 +30,8 @@ module ImportScripts::Mbox
|
|||
if @settings.index_only
|
||||
@skip_updates = true
|
||||
else
|
||||
SiteSetting.tagging_enabled = true if @settings.tags.present?
|
||||
|
||||
import_categories
|
||||
import_users
|
||||
import_posts
|
||||
|
@ -142,9 +144,13 @@ module ImportScripts::Mbox
|
|||
end
|
||||
|
||||
# Builds the attribute hash for the first post of a topic.
#
# row - indexed message row; reads row['subject'] and row['category'] and is
#       passed on to map_post.
#
# Returns the hash produced by map_post, extended with :category, :title and
# (only when at least one tag was found) :tags.
def map_first_post(row)
  # NOTE: `subject` is the same String object as row['subject'];
  # remove_tags! strips prefixes and tag markers from it in place.
  subject = row['subject']
  tags = remove_tags!(subject)

  mapped = map_post(row)
  mapped[:category] = category_id_from_imported_category_id(row['category'])
  # The title is assigned exactly once, from the cleaned subject. An earlier,
  # immediately overwritten assignment from row['subject'] was removed.
  mapped[:title] = subject.strip[0...255]
  mapped[:tags] = tags if tags.present?
  mapped
end
|
||||
|
||||
|
@ -161,6 +167,37 @@ module ImportScripts::Mbox
|
|||
mapped
|
||||
end
|
||||
|
||||
# Strips configured tag markers (and any subject prefixes around them) from
# the beginning of `subject`, modifying the string in place.
#
# Returns the unique names of all matched tags that have a non-blank name.
def remove_tags!(subject)
  found_names = []
  remove_prefixes!(subject)

  loop do
    length_before = subject.length

    @settings.tags.each do |tag|
      next unless subject.sub!(tag[:regex], "")
      # Tags with an empty name only remove the marker from the subject.
      found_names << tag[:name] if tag[:name].present?
    end

    # Nothing was removed in this pass, so there is nothing left to do.
    break if subject.length == length_before

    # Removing a tag may have exposed another prefix like "Re: ".
    remove_prefixes!(subject)
  end

  found_names.uniq
end
|
||||
|
||||
# Removes configured prefixes (e.g. "Re: ") from the beginning of `subject`,
# modifying the string in place. Always returns nil.
#
# Does nothing when no prefix regex is configured:
# @settings.subject_prefix_regex is only set when the list of prefixes in the
# settings is not empty.
def remove_prefixes!(subject)
  prefix_regex = @settings.subject_prefix_regex
  return if prefix_regex.nil?

  # There could be multiple prefixes...
  loop do
    length_before = subject.length
    break unless subject.sub!(prefix_regex, "")

    subject.strip!

    # A prefix pattern that matches the empty string makes sub! succeed
    # without removing anything; stop instead of looping forever.
    break if subject.length == length_before
  end
end
|
||||
|
||||
def create_incoming_email(post, row)
|
||||
IncomingEmail.create(
|
||||
message_id: row['msg_id'],
|
||||
|
|
|
@ -16,3 +16,20 @@ staged: true
|
|||
index_only: false
|
||||
|
||||
group_messages_by_subject: false
|
||||
|
||||
# Remove prefixes like [FOO] or (BAR) from topic titles and replace them with tags.
|
||||
# You can map one or more case-insensitive prefixes to the same tag in Discourse.
|
||||
# "Tag name in Discourse": "foo"
|
||||
# "Tag name in Discourse": ["foo", "bar"]
|
||||
# You can use an empty tag name to remove prefixes without creating a tag in Discourse.
|
||||
tags:
|
||||
# "off-topic": "ot"
|
||||
# announcement: ["ann", "announce", "announcement"]
|
||||
# "": ["foo", "bar"]
|
||||
|
||||
# These prefixes will be removed from the beginning of topic titles. You can use regular expressions.
|
||||
# Prefixes are case-insensitive. You can add additional prefixes (e.g. localized prefixes from
|
||||
# https://en.wikipedia.org/wiki/List_of_email_subject_abbreviations#Abbreviations_in_other_languages).
|
||||
remove_subject_prefixes:
|
||||
- "Re: "
|
||||
- "Fwd?: "
|
|
@ -168,6 +168,7 @@ module ImportScripts::Mbox
|
|||
SELECT email, name, date_of_first_message
|
||||
FROM user
|
||||
WHERE email > :last_email
|
||||
ORDER BY email
|
||||
LIMIT #{@batch_size}
|
||||
SQL
|
||||
|
||||
|
|
|
@ -204,37 +204,7 @@ module ImportScripts::Mbox
|
|||
|
||||
# Returns the cleaned subject of the message held by `receiver`, or nil when
# the message has no subject. Every "[list_name]" marker (case-insensitive)
# is removed before the result is handed to clean_subject.
def extract_subject(receiver, list_name)
  raw_subject = receiver.subject
  return if raw_subject.blank?

  # TODO: make the list name (or maybe multiple names) configurable
  # Strip mailing list name from subject
  list_marker = /\[#{Regexp.escape(list_name)}\]/i
  clean_subject(raw_subject.gsub(list_marker, '').strip)
end
|
||||
|
||||
# TODO: refactor and move prefixes to settings
|
||||
# TODO: refactor and move prefixes to settings
#
# Strips reply and forward prefixes (standard and localized) from the
# beginning of `subject`. The argument itself is not modified.
#
# Returns the stripped subject, or nil when nothing but whitespace remains.
def clean_subject(subject)
  prefixes = [
    # Reply prefixes (Standard and localized)
    /^Re: */i,
    /^R: */i,   # Italian
    /^RIF: */i, # Italian
    # Forward prefixes (Standard and localized)
    /^Fwd: */i,
    /^I: */i    # Italian
  ]

  cleaned = subject

  # In case of mixed localized prefixes there could be many of them if the
  # mail client didn't strip the localized ones, so repeat until a pass no
  # longer shortens the subject. (Previously the result of the recursive
  # call was discarded, so only the first pass had any effect.)
  loop do
    length_before = cleaned.length
    prefixes.each { |prefix| cleaned = cleaned.gsub(prefix, '') }
    cleaned = cleaned.strip
    break unless cleaned.length < length_before
  end

  cleaned.blank? ? nil : cleaned
end
|
||||
|
||||
def ignored_file?(path, checksums)
|
||||
|
|
|
@ -17,6 +17,9 @@ module ImportScripts::Mbox
|
|||
attr_reader :staged
|
||||
attr_reader :index_only
|
||||
attr_reader :group_messages_by_subject
|
||||
attr_reader :subject_prefix_regex
|
||||
attr_reader :automatically_remove_list_name_prefix
|
||||
attr_reader :tags
|
||||
|
||||
def initialize(yaml)
|
||||
@data_dir = yaml['data_dir']
|
||||
|
@ -27,6 +30,22 @@ module ImportScripts::Mbox
|
|||
@staged = yaml['staged']
|
||||
@index_only = yaml['index_only']
|
||||
@group_messages_by_subject = yaml['group_messages_by_subject']
|
||||
|
||||
unless yaml['remove_subject_prefixes'].empty?
|
||||
prefix_regexes = yaml['remove_subject_prefixes'].map { |p| Regexp.new(p) }
|
||||
@subject_prefix_regex = /^#{Regexp.union(prefix_regexes).source}/i
|
||||
end
|
||||
|
||||
@automatically_remove_list_name_prefix = yaml['automatically_remove_list_name_prefix']
|
||||
|
||||
@tags = []
|
||||
yaml['tags'].each do |tag_name, value|
|
||||
prefixes = Regexp.union(value).source
|
||||
@tags << {
|
||||
regex: /^(?:(?:\[(?:#{prefixes})\])|(?:\((?:#{prefixes})\)))\s*/i,
|
||||
name: tag_name
|
||||
}
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue