FIX: improvements for vanilla bulk import (#10212)
Adjustments to the base: 1. The PG connection doesn't require a host - it was broken on the import droplet. 2. Drop `topic_reply_count` - it was removed here - https://github.com/discourse/discourse/blob/master/db/post_migrate/20200513185052_drop_topic_reply_count.rb 3. Fix the error with `backtrace.join("\n")` -> `e.backtrace.join("\n")`. 4. Correctly link the user and avatar in the quote block. Adjustments to vanilla: 1. Top-level Vanilla categories are valid categories. 2. Posts have a `format` column which should be used to decide whether the format is HTML or Markdown. 3. Remove non-UTF-8 characters. 4. Remove unsupported HTML elements like `font`, `span`, `sub`, and `u`.
This commit is contained in:
parent
cc01297f1f
commit
93ff54e184
|
@ -76,7 +76,7 @@ class BulkImport::Base
|
|||
charset = ENV["DB_CHARSET"] || "utf8"
|
||||
db = ActiveRecord::Base.connection_config
|
||||
@encoder = PG::TextEncoder::CopyRow.new
|
||||
@raw_connection = PG.connect(dbname: db[:database], host: db[:host_names]&.first, port: db[:port])
|
||||
@raw_connection = PG.connect(dbname: db[:database], port: db[:port])
|
||||
@uploader = ImportScripts::Uploader.new
|
||||
@html_entities = HTMLEntities.new
|
||||
@encoding = CHARSET_MAP[charset]
|
||||
|
@ -283,7 +283,7 @@ class BulkImport::Base
|
|||
|
||||
USER_STAT_COLUMNS ||= %i{
|
||||
user_id topics_entered time_read days_visited posts_read_count
|
||||
likes_given likes_received topic_reply_count new_since read_faq
|
||||
likes_given likes_received new_since read_faq
|
||||
first_post_created_at post_count topic_count bounce_score
|
||||
reset_bounce_score_after
|
||||
}
|
||||
|
@ -441,14 +441,12 @@ class BulkImport::Base
|
|||
|
||||
def process_user_stat(user_stat)
|
||||
user_stat[:user_id] = @users[user_stat[:imported_user_id].to_i]
|
||||
user_stat[:topic_reply_count] = user_stat[:post_count] - user_stat[:topic_count]
|
||||
user_stat[:topics_entered] ||= 0
|
||||
user_stat[:time_read] ||= 0
|
||||
user_stat[:days_visited] ||= 0
|
||||
user_stat[:posts_read_count] ||= 0
|
||||
user_stat[:likes_given] ||= 0
|
||||
user_stat[:likes_received] ||= 0
|
||||
user_stat[:topic_reply_count] ||= 0
|
||||
user_stat[:new_since] ||= NOW
|
||||
user_stat[:post_count] ||= 0
|
||||
user_stat[:topic_count] ||= 0
|
||||
|
@ -546,7 +544,8 @@ class BulkImport::Base
|
|||
topic_tag
|
||||
end
|
||||
|
||||
def process_raw(raw)
|
||||
def process_raw(original_raw)
|
||||
raw = original_raw.dup
|
||||
# fix whitespaces
|
||||
raw.gsub!(/(\\r)?\\n/, "\n")
|
||||
raw.gsub!("\\t", "\t")
|
||||
|
@ -699,7 +698,7 @@ class BulkImport::Base
|
|||
rescue => e
|
||||
puts "\n"
|
||||
puts "ERROR: #{e.message}"
|
||||
puts backtrace.join("\n")
|
||||
puts e.backtrace.join("\n")
|
||||
end
|
||||
end
|
||||
end
|
||||
|
@ -782,17 +781,25 @@ class BulkImport::Base
|
|||
quote.gsub!(/^(<br>\n?)+/, "")
|
||||
quote.gsub!(/(<br>\n?)+$/, "")
|
||||
|
||||
user = User.find_by(username: username)
|
||||
|
||||
if post_id.present? && topic_id.present?
|
||||
<<-HTML
|
||||
<aside class="quote" data-post="#{post_id}" data-topic="#{topic_id}">
|
||||
<div class="title">#{username}:</div>
|
||||
<div class="title">
|
||||
<div class="quote-controls"></div>
|
||||
#{user ? user_avatar(user) : username}:
|
||||
</div>
|
||||
<blockquote>#{quote}</blockquote>
|
||||
</aside>
|
||||
HTML
|
||||
else
|
||||
<<-HTML
|
||||
<aside class="quote">
|
||||
<div class="title">#{username}:</div>
|
||||
<aside class="quote no-group" data-username="#{username}">
|
||||
<div class="title">
|
||||
<div class="quote-controls"></div>
|
||||
#{user ? user_avatar(user) : username}:
|
||||
</div>
|
||||
<blockquote>#{quote}</blockquote>
|
||||
</aside>
|
||||
HTML
|
||||
|
@ -802,6 +809,11 @@ class BulkImport::Base
|
|||
cooked.scrub.strip
|
||||
end
|
||||
|
||||
def user_avatar(user)
|
||||
url = user.avatar_template.gsub("{size}", "45")
|
||||
"<img alt=\"\" width=\"20\" height=\"20\" src=\"#{url}\" class=\"avatar\"> #{user.username}"
|
||||
end
|
||||
|
||||
def pre_fancy(title)
|
||||
Redcarpet::Render::SmartyPants.render(ERB::Util.html_escape(title)).scrub.strip
|
||||
end
|
||||
|
|
|
@ -188,7 +188,7 @@ class BulkImport::Vanilla < BulkImport::Base
|
|||
now = Time.zone.now
|
||||
|
||||
create_user_stats(users) do |row|
|
||||
next unless @users[row['UserID'].to_s] # shouldn't need this but it can be NULL :<
|
||||
next unless @users[row['UserID'].to_i] # shouldn't need this but it can be NULL :<
|
||||
|
||||
{
|
||||
imported_id: row['UserID'],
|
||||
|
@ -371,9 +371,8 @@ class BulkImport::Vanilla < BulkImport::Base
|
|||
|
||||
# Throw the -1 level categories away since they contain no topics.
|
||||
# Use the next level as root categories.
|
||||
root_category_ids = Set.new(categories.select { |c| c["ParentCategoryID"] == -1 }.map { |c| c["CategoryID"] })
|
||||
|
||||
top_level_categories = categories.select { |c| root_category_ids.include?(c["ParentCategoryID"]) }
|
||||
top_level_categories = categories.select { |c| c["ParentCategoryID"].blank? || c['ParentCategoryID'] == -1 }
|
||||
|
||||
# Depth = 2
|
||||
create_categories(top_level_categories) do |category|
|
||||
|
@ -432,13 +431,13 @@ class BulkImport::Vanilla < BulkImport::Base
|
|||
def import_topics
|
||||
puts "", "Importing topics..."
|
||||
|
||||
topics_sql = "SELECT DiscussionID, CategoryID, Name, Body, DateInserted, InsertUserID, Announce
|
||||
topics_sql = "SELECT DiscussionID, CategoryID, Name, Body, DateInserted, InsertUserID, Announce, Format
|
||||
FROM #{TABLE_PREFIX}Discussion
|
||||
WHERE DiscussionID > #{@last_imported_topic_id}
|
||||
ORDER BY DiscussionID ASC"
|
||||
|
||||
create_topics(mysql_stream(topics_sql)) do |row|
|
||||
{
|
||||
data = {
|
||||
imported_id: row["DiscussionID"],
|
||||
title: normalize_text(row["Name"]),
|
||||
category_id: category_id_from_imported_id(row["CategoryID"]) ||
|
||||
|
@ -447,18 +446,20 @@ class BulkImport::Vanilla < BulkImport::Base
|
|||
created_at: Time.zone.at(row['DateInserted']),
|
||||
pinned_at: row['Announce'] == 0 ? nil : Time.zone.at(row['DateInserted'])
|
||||
}
|
||||
(data[:user_id].present? && data[:title].present?) ? data : false
|
||||
end
|
||||
|
||||
puts "", "importing first posts..."
|
||||
|
||||
create_posts(mysql_stream(topics_sql)) do |row|
|
||||
{
|
||||
data = {
|
||||
imported_id: "d-" + row['DiscussionID'].to_s,
|
||||
topic_id: topic_id_from_imported_id(row["DiscussionID"]),
|
||||
topic_id: topic_id_from_imported_id(row['DiscussionID']),
|
||||
user_id: user_id_from_imported_id(row["InsertUserID"]),
|
||||
created_at: Time.zone.at(row['DateInserted']),
|
||||
raw: clean_up(row["Body"])
|
||||
raw: clean_up(row['Body'], row['Format'])
|
||||
}
|
||||
data[:topic_id].present? ? data : false
|
||||
end
|
||||
|
||||
puts '', 'converting deep categories to tags...'
|
||||
|
@ -477,7 +478,7 @@ class BulkImport::Vanilla < BulkImport::Base
|
|||
puts "", "Importing posts..."
|
||||
|
||||
posts = mysql_stream(
|
||||
"SELECT CommentID, DiscussionID, Body, DateInserted, InsertUserID
|
||||
"SELECT CommentID, DiscussionID, Body, DateInserted, InsertUserID, Format
|
||||
FROM #{TABLE_PREFIX}Comment
|
||||
WHERE CommentID > #{@last_imported_post_id}
|
||||
ORDER BY CommentID ASC")
|
||||
|
@ -489,9 +490,9 @@ class BulkImport::Vanilla < BulkImport::Base
|
|||
{
|
||||
imported_id: row['CommentID'],
|
||||
topic_id: topic_id,
|
||||
user_id: user_id_from_imported_id(row["InsertUserID"]),
|
||||
user_id: user_id_from_imported_id(row['InsertUserID']),
|
||||
created_at: Time.zone.at(row['DateInserted']),
|
||||
raw: clean_up(row["Body"])
|
||||
raw: clean_up(row['Body'], row['Format'])
|
||||
}
|
||||
end
|
||||
end
|
||||
|
@ -572,7 +573,7 @@ class BulkImport::Vanilla < BulkImport::Base
|
|||
puts "", "importing private replies..."
|
||||
|
||||
private_posts_sql = "
|
||||
SELECT ConversationID, MessageID, Body, InsertUserID, DateInserted
|
||||
SELECT ConversationID, MessageID, Body, InsertUserID, DateInserted, Format
|
||||
FROM GDN_ConversationMessage
|
||||
WHERE ConversationID > #{@last_imported_private_topic_id - PRIVATE_OFFSET}
|
||||
ORDER BY ConversationID ASC, MessageID ASC"
|
||||
|
@ -585,7 +586,7 @@ class BulkImport::Vanilla < BulkImport::Base
|
|||
topic_id: topic_id,
|
||||
user_id: user_id_from_imported_id(row['InsertUserID']),
|
||||
created_at: Time.zone.at(row['DateInserted']),
|
||||
raw: clean_up(row['Body'])
|
||||
raw: clean_up(row['Body'], row['Format'])
|
||||
}
|
||||
end
|
||||
end
|
||||
|
@ -650,13 +651,48 @@ class BulkImport::Vanilla < BulkImport::Base
|
|||
end
|
||||
end
|
||||
|
||||
def clean_up(raw)
|
||||
# post id is sometimes prefixed with "c-"
|
||||
raw.gsub!(/\[QUOTE="([^;]+);c-(\d+)"\]/i) { "[QUOTE=#{$1};#{$2}]" }
|
||||
raw = raw.delete("\u0000")
|
||||
raw = process_raw_text(raw)
|
||||
def clean_up(raw, format)
|
||||
raw.encode!("utf-8", "utf-8", invalid: :replace, undef: :replace, replace: "")
|
||||
|
||||
raw
|
||||
raw.gsub!(/<(.+)> <\/\1>/, "\n\n")
|
||||
|
||||
html =
|
||||
if format == 'Html'
|
||||
raw
|
||||
else
|
||||
markdown = Redcarpet::Markdown.new(Redcarpet::Render::HTML, autolink: true, tables: true)
|
||||
markdown.render(raw)
|
||||
end
|
||||
|
||||
doc = Nokogiri::HTML5.fragment(html)
|
||||
|
||||
doc.css("blockquote").each do |bq|
|
||||
name = bq["rel"]
|
||||
user = User.find_by(name: name)
|
||||
bq.replace %{<br>[QUOTE="#{user&.username || name}"]\n#{bq.inner_html}\n[/QUOTE]<br>}
|
||||
end
|
||||
|
||||
doc.css("font").reverse.each do |f|
|
||||
f.replace f.inner_html
|
||||
end
|
||||
|
||||
doc.css("span").reverse.each do |f|
|
||||
f.replace f.inner_html
|
||||
end
|
||||
|
||||
doc.css("sub").reverse.each do |f|
|
||||
f.replace f.inner_html
|
||||
end
|
||||
|
||||
doc.css("u").reverse.each do |f|
|
||||
f.replace f.inner_html
|
||||
end
|
||||
|
||||
markdown = format == 'Html' ? ReverseMarkdown.convert(doc.to_html) : doc.to_html
|
||||
markdown.gsub!(/\[QUOTE="([^;]+);c-(\d+)"\]/i) { "[QUOTE=#{$1};#{$2}]" }
|
||||
|
||||
markdown = process_raw_text(markdown)
|
||||
markdown
|
||||
end
|
||||
|
||||
def process_raw_text(raw)
|
||||
|
|
Loading…
Reference in New Issue