DEV: Improve mbox import script
* Customizable email subject prefixes to remove "Re" and "Fwd" as well as localized prefixes. * Configuration option for prefixes like [FOO] or (BAR) which can be replaced with tags during import. * Bugfix: Import script might have skipped some users due to missing ORDER BY.
This commit is contained in:
parent
edc8d58ac3
commit
0e752db411
|
@ -30,6 +30,8 @@ module ImportScripts::Mbox
|
||||||
if @settings.index_only
|
if @settings.index_only
|
||||||
@skip_updates = true
|
@skip_updates = true
|
||||||
else
|
else
|
||||||
|
SiteSetting.tagging_enabled = true if @settings.tags.present?
|
||||||
|
|
||||||
import_categories
|
import_categories
|
||||||
import_users
|
import_users
|
||||||
import_posts
|
import_posts
|
||||||
|
@ -142,9 +144,13 @@ module ImportScripts::Mbox
|
||||||
end
|
end
|
||||||
|
|
||||||
def map_first_post(row)
|
def map_first_post(row)
|
||||||
|
subject = row['subject']
|
||||||
|
tags = remove_tags!(subject)
|
||||||
|
|
||||||
mapped = map_post(row)
|
mapped = map_post(row)
|
||||||
mapped[:category] = category_id_from_imported_category_id(row['category'])
|
mapped[:category] = category_id_from_imported_category_id(row['category'])
|
||||||
mapped[:title] = row['subject'].strip[0...255]
|
mapped[:title] = subject.strip[0...255]
|
||||||
|
mapped[:tags] = tags if tags.present?
|
||||||
mapped
|
mapped
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -161,6 +167,37 @@ module ImportScripts::Mbox
|
||||||
mapped
|
mapped
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Strips configured tag prefixes such as "[FOO]" or "(BAR)" from the start of
# +subject+ and collects the names of the tags they map to. Subject prefixes
# are removed before, and again after every pass that shortened the subject,
# so tag prefixes hidden behind them are found as well.
#
# The subject is modified in place.
#
# @param subject [String, nil] topic title to clean; nil is tolerated because
#   blank subjects are stored as nil by the indexer
# @return [Array] unique, non-blank names of the tags whose prefixes were removed
def remove_tags!(subject)
  tag_names = []
  return tag_names if subject.nil?

  remove_prefixes!(subject)

  loop do
    old_length = subject.length

    @settings.tags.each do |tag|
      next unless subject.sub!(tag[:regex], "")

      # Tags with an empty name only remove the prefix; no tag is recorded.
      tag_names << tag[:name] if tag[:name].present?
    end

    # Nothing was removed in this pass, so no further tag prefix can match.
    break if subject.length == old_length

    # Removing a tag may have exposed another subject prefix in front of the
    # next tag prefix.
    remove_prefixes!(subject)
  end

  tag_names.uniq
end
|
||||||
|
|
||||||
|
# Strips every leading subject prefix matched by
# +@settings.subject_prefix_regex+ (for example "Re: " or "Fwd: ") from
# +subject+. The subject is modified in place.
#
# @param subject [String, nil] topic title to clean
# @return [nil]
def remove_prefixes!(subject)
  regex = @settings.subject_prefix_regex

  # The regex is nil when no prefixes are configured; String#sub! would raise
  # a TypeError in that case.
  return if regex.nil? || subject.nil?

  # There could be multiple prefixes...
  loop do
    length_before = subject.length
    break unless subject.sub!(regex, "")

    subject.strip!

    # A pattern that can match the empty string "succeeds" without removing
    # anything; stop instead of looping forever.
    break if subject.length == length_before
  end

  nil
end
|
||||||
|
|
||||||
def create_incoming_email(post, row)
|
def create_incoming_email(post, row)
|
||||||
IncomingEmail.create(
|
IncomingEmail.create(
|
||||||
message_id: row['msg_id'],
|
message_id: row['msg_id'],
|
||||||
|
|
|
@ -16,3 +16,20 @@ staged: true
|
||||||
index_only: false
|
index_only: false
|
||||||
|
|
||||||
group_messages_by_subject: false
|
group_messages_by_subject: false
|
||||||
|
|
||||||
|
# Remove prefixes like [FOO] or (BAR) from topic titles and replace them with tags.
|
||||||
|
# You can map one or more case-insensitive prefixes to the same tag in Discourse.
|
||||||
|
# "Tag name in Discourse": "foo"
|
||||||
|
# "Tag name in Discourse": ["foo", "bar"]
|
||||||
|
# You can use an empty tag name to remove prefixes without creating a tag in Discourse.
|
||||||
|
tags:
|
||||||
|
# "off-topic": "ot"
|
||||||
|
# announcement: ["ann", "announce", "announcement"]
|
||||||
|
# "": ["foo", "bar"]
|
||||||
|
|
||||||
|
# These prefixes will be removed from the beginning of topic titles. You can use regular expressions.
|
||||||
|
# Prefixes are case-insensitive. You can add additional prefixes (e.g. localized prefixes from
|
||||||
|
# https://en.wikipedia.org/wiki/List_of_email_subject_abbreviations#Abbreviations_in_other_languages).
|
||||||
|
remove_subject_prefixes:
|
||||||
|
- "Re: "
|
||||||
|
- "Fwd?: "
|
|
@ -168,6 +168,7 @@ module ImportScripts::Mbox
|
||||||
SELECT email, name, date_of_first_message
|
SELECT email, name, date_of_first_message
|
||||||
FROM user
|
FROM user
|
||||||
WHERE email > :last_email
|
WHERE email > :last_email
|
||||||
|
ORDER BY email
|
||||||
LIMIT #{@batch_size}
|
LIMIT #{@batch_size}
|
||||||
SQL
|
SQL
|
||||||
|
|
||||||
|
|
|
@ -204,37 +204,7 @@ module ImportScripts::Mbox
|
||||||
|
|
||||||
def extract_subject(receiver, list_name)
|
def extract_subject(receiver, list_name)
|
||||||
subject = receiver.subject
|
subject = receiver.subject
|
||||||
return nil if subject.blank?
|
subject.blank? ? nil : subject.strip
|
||||||
|
|
||||||
# TODO: make the list name (or maybe multiple names) configurable
|
|
||||||
# Strip mailing list name from subject
|
|
||||||
subject = subject.gsub(/\[#{Regexp.escape(list_name)}\]/i, '').strip
|
|
||||||
|
|
||||||
clean_subject(subject)
|
|
||||||
end
|
|
||||||
|
|
||||||
# TODO: refactor and move prefixes to settings
|
|
||||||
def clean_subject(subject)
|
|
||||||
original_length = subject.length
|
|
||||||
|
|
||||||
# Strip Reply prefix from title (Standard and localized)
|
|
||||||
subject = subject.gsub(/^Re: */i, '')
|
|
||||||
subject = subject.gsub(/^R: */i, '') #Italian
|
|
||||||
subject = subject.gsub(/^RIF: */i, '') #Italian
|
|
||||||
|
|
||||||
# Strip Forward prefix from title (Standard and localized)
|
|
||||||
subject = subject.gsub(/^Fwd: */i, '')
|
|
||||||
subject = subject.gsub(/^I: */i, '') #Italian
|
|
||||||
|
|
||||||
subject.strip
|
|
||||||
|
|
||||||
# In case of mixed localized prefixes there could be many of them
|
|
||||||
# if the mail client didn't strip the localized ones
|
|
||||||
if original_length > subject.length
|
|
||||||
clean_subject(subject)
|
|
||||||
else
|
|
||||||
subject
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
|
|
||||||
def ignored_file?(path, checksums)
|
def ignored_file?(path, checksums)
|
||||||
|
|
|
@ -17,6 +17,9 @@ module ImportScripts::Mbox
|
||||||
attr_reader :staged
|
attr_reader :staged
|
||||||
attr_reader :index_only
|
attr_reader :index_only
|
||||||
attr_reader :group_messages_by_subject
|
attr_reader :group_messages_by_subject
|
||||||
|
attr_reader :subject_prefix_regex
|
||||||
|
attr_reader :automatically_remove_list_name_prefix
|
||||||
|
attr_reader :tags
|
||||||
|
|
||||||
def initialize(yaml)
|
def initialize(yaml)
|
||||||
@data_dir = yaml['data_dir']
|
@data_dir = yaml['data_dir']
|
||||||
|
@ -27,6 +30,22 @@ module ImportScripts::Mbox
|
||||||
@staged = yaml['staged']
|
@staged = yaml['staged']
|
||||||
@index_only = yaml['index_only']
|
@index_only = yaml['index_only']
|
||||||
@group_messages_by_subject = yaml['group_messages_by_subject']
|
@group_messages_by_subject = yaml['group_messages_by_subject']
|
||||||
|
|
||||||
|
unless yaml['remove_subject_prefixes'].empty?
|
||||||
|
prefix_regexes = yaml['remove_subject_prefixes'].map { |p| Regexp.new(p) }
|
||||||
|
@subject_prefix_regex = /^#{Regexp.union(prefix_regexes).source}/i
|
||||||
|
end
|
||||||
|
|
||||||
|
@automatically_remove_list_name_prefix = yaml['automatically_remove_list_name_prefix']
|
||||||
|
|
||||||
|
@tags = []
|
||||||
|
yaml['tags'].each do |tag_name, value|
|
||||||
|
prefixes = Regexp.union(value).source
|
||||||
|
@tags << {
|
||||||
|
regex: /^(?:(?:\[(?:#{prefixes})\])|(?:\((?:#{prefixes})\)))\s*/i,
|
||||||
|
name: tag_name
|
||||||
|
}
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue