Improve Telligent import script

* Imports private messages
* Replaces internal links for topics and replies
* Allows incremental import of accepted answers
This commit is contained in:
Gerhard Schlager 2020-04-03 18:10:52 +02:00
parent 3cfe086a94
commit 2b2584912a
1 changed files with 158 additions and 6 deletions

View File

@ -50,6 +50,7 @@ class ImportScripts::Telligent < ImportScripts::Base
/\[(?<tag>img|url)=[^\[]*?cfs-file(?:systemfile)?(?:\.ashx)?\/__key\/(?<directory>[^\/]+)\/(?<path>[^\/]+)\/(?<filename>.+?)\][^\[]*?\[\/\k<tag>\]/i
]
PROPERTY_NAMES_REGEX ||= /(?<name>\w+):S:(?<start>\d+):(?<length>\d+):/
INTERNAL_LINK_REGEX ||= /\shref=".*?\/f\/\d+(?:(\/t\/(?<topic_id>\d+))|(?:\/p\/\d+\/(?<post_id>\d+))|(?:\/p\/(?<post_id>\d+)\/reply))\.aspx[^"]*?"/i
CATEGORY_LINK_NORMALIZATION = '/.*?(f\/\d+)$/\1'
TOPIC_LINK_NORMALIZATION = '/.*?(f\/\d+\/t\/\d+)$/\1'
@ -109,6 +110,7 @@ class ImportScripts::Telligent < ImportScripts::Base
import_users
import_topics
import_posts
import_messages
mark_topics_as_solved
end
@ -127,7 +129,12 @@ class ImportScripts::Telligent < ImportScripts::Base
WHERE t.UserId = u.UserID) OR
EXISTS(SELECT 1
FROM te_Forum_ThreadReplies r
WHERE r.UserId = u.UserID)
WHERE r.UserId = u.UserID) OR
EXISTS(SELECT 1
FROM cs_Messaging_ConversationParticipants p
JOIN cs_Messaging_ConversationMessages cm ON p.ConversationId = cm.ConversationId
JOIN cs_Messaging_Messages m ON m.MessageId = cm.MessageId
WHERE p.ParticipantId = u.UserID)
)
SQL
@ -508,6 +515,116 @@ class ImportScripts::Telligent < ImportScripts::Base
end
end
def import_messages
puts "", "Importing messages..."
current_conversation_id = ""
current_topic_import_id = ""
last_conversation_id = ""
total_count = count(<<~SQL)
SELECT COUNT(1) AS count
FROM cs_Messaging_Messages m
JOIN cs_Messaging_ConversationMessages cm ON m.MessageId = cm.MessageId
SQL
batches do |offset|
if last_conversation_id.blank?
conditions = ""
else
conditions = <<~SQL
WHERE cm.ConversationId > '#{last_conversation_id}'
SQL
end
rows = query(<<~SQL)
SELECT TOP #{BATCH_SIZE}
cm.ConversationId, m.MessageId, m.AuthorId, m.Subject, m.Body, m.DateCreated,
STUFF((SELECT ';' + CONVERT(VARCHAR, p.ParticipantId)
FROM cs_Messaging_ConversationParticipants p
WHERE p.ConversationId = cm.ConversationId
ORDER BY p.ParticipantId
FOR XML PATH('')), 1, 1, '') AS ParticipantIds
FROM cs_Messaging_Messages m
JOIN cs_Messaging_ConversationMessages cm ON m.MessageId = cm.MessageId
#{conditions}
ORDER BY cm.ConversationId, m.DateCreated, m.MessageId
SQL
break if rows.blank?
last_row = rows[-1]
last_conversation_id = last_row["ConversationId"]
next if all_records_exist?(:post, rows.map { |row| row["MessageId"] })
create_posts(rows, total: total_count, offset: offset) do |row|
user_id = user_id_from_imported_user_id(row["AuthorId"]) || Discourse::SYSTEM_USER_ID
post = {
id: row["MessageId"],
raw: raw_with_attachment(row, user_id, :message),
user_id: user_id,
created_at: row["DateCreated"]
}
if current_conversation_id == row["ConversationId"]
parent_post = topic_lookup_from_imported_post_id(current_topic_import_id)
if parent_post
post[:topic_id] = parent_post[:topic_id]
else
puts "Failed to import message #{row['MessageId']}. Parent was not found."
post = nil
end
else
post[:title] = CGI.unescapeHTML(row["Subject"])
post[:archetype] = Archetype.private_message
post[:target_usernames] = get_recipient_usernames(row)
if post[:target_usernames].empty?
puts "Private message without recipients. Skipping #{row['MessageId']}"
post = nil
end
current_topic_import_id = row["MessageId"]
end
current_conversation_id = row["ConversationId"]
post
end
end
# Mark all imported messages as read
DB.exec(<<~SQL)
UPDATE topic_users tu
SET last_read_post_number = t.highest_post_number,
highest_seen_post_number = t.highest_post_number
FROM topics t
JOIN topic_custom_fields tcf ON t.id = tcf.topic_id
WHERE tu.topic_id = t.id
AND tu.user_id > 0
AND t.archetype = 'private_message'
AND tcf.name = 'import_id'
SQL
end
def get_recipient_user_ids(participant_ids)
return [] if participant_ids.blank?
user_ids = participant_ids.split(';')
user_ids.uniq!
user_ids.map!(&:strip)
end
def get_recipient_usernames(row)
import_user_ids = get_recipient_user_ids(row["ParticipantIds"])
import_user_ids.map! do |import_user_id|
find_user_by_import_id(import_user_id).try(:username)
end.compact
end
def index_directory(root_directory)
Dir.foreach(root_directory) do |directory_name|
next if directory_name == "." || directory_name == ".."
@ -524,7 +641,7 @@ class ImportScripts::Telligent < ImportScripts::Base
end
def raw_with_attachment(row, user_id, type)
raw, embedded_paths, upload_ids = replace_embedded_attachments(row["Body"], user_id)
raw, embedded_paths, upload_ids = replace_embedded_attachments(row, user_id, type)
raw = html_to_markdown(raw) || ""
filename = row["FileName"]
@ -551,15 +668,28 @@ class ImportScripts::Telligent < ImportScripts::Base
raw = "#{raw}\n#{@uploader.html_for_upload(upload, filename)}"
end
else
id = type == :topic ? row['ThreadId'] : row['ThreadReplyId']
STDERR.puts "Could not find file for #{type} #{id}: #{path}"
print_file_not_found_error(type, path, row)
end
end
raw
end
def replace_embedded_attachments(raw, user_id)
def print_file_not_found_error(type, path, row)
case type
when :topic
id = row['ThreadId']
when :post
id = row['ThreadReplyId']
when :message
id = row['MessageId']
end
STDERR.puts "Could not find file for #{type} #{id}: #{path}"
end
def replace_embedded_attachments(row, user_id, type)
raw = row["Body"]
paths = []
upload_ids = []
@ -583,7 +713,7 @@ class ImportScripts::Telligent < ImportScripts::Base
end
else
path = File.join(path, match_data[:filename])
STDERR.puts "Could not find file: #{path}"
print_file_not_found_error(type, path, row)
match_data[0]
end
end
@ -673,6 +803,8 @@ class ImportScripts::Telligent < ImportScripts::Base
def html_to_markdown(html)
return html if html.blank?
html = fix_internal_links(html)
md = HtmlToMarkdown.new(html).to_markdown
md.gsub!(/\[quote.*?\]/, "\n" + '\0' + "\n")
md.gsub!(/(?<!^)\[\/quote\]/, "\n[/quote]\n")
@ -682,6 +814,21 @@ class ImportScripts::Telligent < ImportScripts::Base
md
end
def fix_internal_links(html)
html.gsub(INTERNAL_LINK_REGEX) do
match_data = Regexp.last_match
if match_data[:topic_id].present?
imported_id = import_topic_id(match_data[:topic_id])
else
imported_id = match_data[:post_id]
end
post = topic_lookup_from_imported_post_id(imported_id) if imported_id
post ? %Q| href="#{Discourse.base_url}#{post[:url]}"| : match_data[0]
end
end
def parse_properties(names, values)
properties = {}
return properties if names.blank? || values.blank?
@ -706,6 +853,11 @@ class ImportScripts::Telligent < ImportScripts::Base
FROM post_custom_fields pcf
JOIN posts p ON p.id = pcf.post_id
WHERE pcf.name = 'is_accepted_answer' AND pcf.value = 'true'
AND NOT EXISTS (
SELECT 1
FROM topic_custom_fields x
WHERE x.topic_id = p.topic_id AND x.name = 'accepted_answer_post_id'
)
SQL
end