DEV: Improve generic import script (#25972)

* FEATURE: Import into `category_users` table
* FIX: Failed to import `user_options` unless `timezone` was set
* FIX: Prevent reusing original `id` from intermediate DB in `user_fields`
* FEATURE: Order posts by `post_number` if available
* FEATURE: Allow `[mention]` placeholder to reference users by "id" or "name" (username)
* FEATURE: Support `[quote]` placeholders in posts
* FEATURE: Support `[link]` placeholders in posts
* FEATURE: Support all kinds of permalinks and remove support for `old_relative_url`
* PERF: Speed up pre-cooking by removing DB lookups
This commit is contained in:
Gerhard Schlager 2024-03-05 22:23:36 +01:00 committed by GitHub
parent 5c1147adf3
commit bc98740205
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 246 additions and 119 deletions

View File

@ -224,7 +224,7 @@ class BulkImport::Base
def load_indexes def load_indexes
puts "Loading groups indexes..." puts "Loading groups indexes..."
@last_group_id = last_id(Group) @last_group_id = last_id(Group)
group_names = Group.unscoped.pluck(:name).map(&:downcase).to_set @group_names_lower = Group.unscoped.pluck(:name).map(&:downcase).to_set
puts "Loading users indexes..." puts "Loading users indexes..."
@last_user_id = last_id(User) @last_user_id = last_id(User)
@ -232,7 +232,7 @@ class BulkImport::Base
@last_sso_record_id = last_id(SingleSignOnRecord) @last_sso_record_id = last_id(SingleSignOnRecord)
@emails = UserEmail.pluck(:email, :user_id).to_h @emails = UserEmail.pluck(:email, :user_id).to_h
@external_ids = SingleSignOnRecord.pluck(:external_id, :user_id).to_h @external_ids = SingleSignOnRecord.pluck(:external_id, :user_id).to_h
@usernames_and_groupnames_lower = User.unscoped.pluck(:username_lower).to_set.merge(group_names) @usernames_lower = User.unscoped.pluck(:username_lower).to_set
@anonymized_user_suffixes = @anonymized_user_suffixes =
DB.query_single( DB.query_single(
"SELECT SUBSTRING(username_lower, 5)::BIGINT FROM users WHERE username_lower ~* '^anon\\d+$'", "SELECT SUBSTRING(username_lower, 5)::BIGINT FROM users WHERE username_lower ~* '^anon\\d+$'",
@ -245,6 +245,9 @@ class BulkImport::Base
.to_h .to_h
@last_user_avatar_id = last_id(UserAvatar) @last_user_avatar_id = last_id(UserAvatar)
@last_upload_id = last_id(Upload) @last_upload_id = last_id(Upload)
@user_ids_by_username_lower = User.unscoped.pluck(:id, :username_lower).to_h
@usernames_by_id = User.unscoped.pluck(:id, :username).to_h
@user_full_names_by_id = User.unscoped.where("name IS NOT NULL").pluck(:id, :name).to_h
puts "Loading categories indexes..." puts "Loading categories indexes..."
@last_category_id = last_id(Category) @last_category_id = last_id(Category)
@ -354,6 +357,19 @@ class BulkImport::Base
@users[id.to_i] @users[id.to_i]
end end
# Looks up the user id for a username as it appeared in the source system.
#
# The original username is first translated through @mapped_usernames (falling
# back to the username itself when there is no mapping), normalized with
# User.normalize_username, and then resolved via @user_ids_by_username_lower.
#
# @param username [String] username from the source data
# @return the id stored for the normalized username, or nil when none is known
def user_id_from_original_username(username)
  current_username = @mapped_usernames[username] || username
  @user_ids_by_username_lower[User.normalize_username(current_username)]
end
# Returns the username stored in @usernames_by_id for the given user id,
# or nil when the id has no entry in that index.
def username_from_id(id)
@usernames_by_id[id]
end
# Returns the full name stored in @user_full_names_by_id for the given user id,
# or nil when the id has no entry in that index.
def user_full_name_from_id(id)
@user_full_names_by_id[id]
end
def category_id_from_imported_id(id) def category_id_from_imported_id(id)
@categories[id.to_i] @categories[id.to_i]
end end
@ -547,6 +563,8 @@ class BulkImport::Base
CATEGORY_TAG_GROUP_COLUMNS ||= %i[category_id tag_group_id created_at updated_at] CATEGORY_TAG_GROUP_COLUMNS ||= %i[category_id tag_group_id created_at updated_at]
CATEGORY_USER_COLUMNS ||= %i[category_id user_id notification_level last_seen_at]
TOPIC_COLUMNS ||= %i[ TOPIC_COLUMNS ||= %i[
id id
archetype archetype
@ -745,6 +763,7 @@ class BulkImport::Base
post_id post_id
category_id category_id
tag_id tag_id
user_id
external_url external_url
created_at created_at
updated_at updated_at
@ -824,6 +843,10 @@ class BulkImport::Base
create_records(rows, "category_tag_group", CATEGORY_TAG_GROUP_COLUMNS, &block) create_records(rows, "category_tag_group", CATEGORY_TAG_GROUP_COLUMNS, &block)
end end
# Creates "category_user" records from +rows+ by delegating to create_records
# with the CATEGORY_USER_COLUMNS column list; the given block is forwarded as-is.
def create_category_users(rows, &block)
create_records(rows, "category_user", CATEGORY_USER_COLUMNS, &block)
end
def create_topics(rows, &block) def create_topics(rows, &block)
create_records(rows, "topic", TOPIC_COLUMNS, &block) create_records(rows, "topic", TOPIC_COLUMNS, &block)
end end
@ -925,9 +948,9 @@ class BulkImport::Base
group[:name] = fix_name(group[:name]) group[:name] = fix_name(group[:name])
unless @usernames_and_groupnames_lower.add?(group[:name].downcase) if group_or_user_exist?(group[:name])
group_name = group[:name] + "_1" group_name = group[:name] + "_1"
group_name.next! until @usernames_and_groupnames_lower.add?(group_name.downcase) group_name.next! while group_or_user_exist?(group_name)
group[:name] = group_name group[:name] = group_name
end end
@ -945,6 +968,12 @@ class BulkImport::Base
group group
end end
# Reports whether +name+ (compared case-insensitively) is already used by a
# known user or group.
#
# NOTE: this predicate has a side effect. When the name is not a known
# username and not yet a known group name, it is added to @group_names_lower,
# i.e. a `false` result also reserves the name for the caller.
#
# @param name [String] candidate group name
# @return [Boolean] true when the name was already taken
def group_or_user_exist?(name)
  key = name.downcase
  @usernames_lower.include?(key) || @group_names_lower.add?(key).nil?
end
def process_user(user) def process_user(user)
if user[:email].present? if user[:email].present?
user[:email].downcase! user[:email].downcase!
@ -976,9 +1005,9 @@ class BulkImport::Base
end end
# unique username_lower # unique username_lower
unless @usernames_and_groupnames_lower.add?(user[:username].downcase) if user_exist?(user[:username])
username = user[:username] + "_1" username = user[:username] + "_1"
username.next! until @usernames_and_groupnames_lower.add?(username.downcase) username.next! while user_exist?(username)
user[:username] = username user[:username] = username
end end
@ -998,9 +1027,18 @@ class BulkImport::Base
user[:date_of_birth] = Date.new(1904, date_of_birth.month, date_of_birth.day) user[:date_of_birth] = Date.new(1904, date_of_birth.month, date_of_birth.day)
end end
@user_ids_by_username_lower[user[:username_lower]] = user[:id]
@usernames_by_id[user[:id]] = user[:username]
@user_full_names_by_id[user[:id]] = user[:name] if user[:name].present?
user user
end end
# Reports whether +username+ (compared case-insensitively) is already present
# in @usernames_lower.
#
# NOTE: this predicate has a side effect. A username that is not yet known is
# added to @usernames_lower, so a `false` result also reserves the name.
#
# @param username [String] candidate username
# @return [Boolean] true when the username was already taken
def user_exist?(username)
  # Set#add? returns nil when the element was already in the set.
  !@usernames_lower.add?(username.downcase)
end
def process_user_email(user_email) def process_user_email(user_email)
user_email[:id] = @last_user_email_id += 1 user_email[:id] = @last_user_email_id += 1
user_email[:primary] = true user_email[:primary] = true
@ -1163,6 +1201,10 @@ class BulkImport::Base
category_tag_group category_tag_group
end end
# Returns the given category_user record unchanged; no defaults or ids are
# filled in here.
# NOTE(review): presumably invoked per record by create_records for the
# "category_user" type, like the other process_* methods -- confirm.
def process_category_user(category_user)
category_user
end
def process_topic(topic) def process_topic(topic)
@topics[topic[:imported_id].to_i] = topic[:id] = @last_topic_id += 1 @topics[topic[:imported_id].to_i] = topic[:id] = @last_topic_id += 1
topic[:archetype] ||= Archetype.default topic[:archetype] ||= Archetype.default
@ -1682,21 +1724,22 @@ class BulkImport::Base
cooked = @markdown.render(cooked).scrub.strip cooked = @markdown.render(cooked).scrub.strip
cooked.gsub!(%r{\[QUOTE="?([^,"]+)(?:, post:(\d+), topic:(\d+))?"?\](.+?)\[/QUOTE\]}im) do cooked.gsub!(
username, post_id, topic_id, quote = $1, $2, $3, $4 %r{\[QUOTE=(?:"|")?(.+?)(?:, post:(\d+), topic:(\d+))?(?:, username:(.+?))?(?:"|")?\](.+?)\[/QUOTE\]}im,
) do
name_or_username, post_id, topic_id, username, quote = $1, $2, $3, $4, $5
username ||= name_or_username
quote = quote.scrub.strip quote = quote.scrub.strip
quote.gsub!(/^(<br>\n?)+/, "") quote.gsub!(/^(<br>\n?)+/, "")
quote.gsub!(/(<br>\n?)+$/, "") quote.gsub!(/(<br>\n?)+$/, "")
user = User.find_by(username: username)
if post_id.present? && topic_id.present? if post_id.present? && topic_id.present?
<<-HTML <<-HTML
<aside class="quote" data-post="#{post_id}" data-topic="#{topic_id}"> <aside class="quote" data-post="#{post_id}" data-topic="#{topic_id}">
<div class="title"> <div class="title">
<div class="quote-controls"></div> <div class="quote-controls"></div>
#{user ? user_avatar(user) : username}: #{name_or_username}:
</div> </div>
<blockquote>#{quote}</blockquote> <blockquote>#{quote}</blockquote>
</aside> </aside>
@ -1706,7 +1749,7 @@ class BulkImport::Base
<aside class="quote no-group" data-username="#{username}"> <aside class="quote no-group" data-username="#{username}">
<div class="title"> <div class="title">
<div class="quote-controls"></div> <div class="quote-controls"></div>
#{user ? user_avatar(user) : username}: #{name_or_username}:
</div> </div>
<blockquote>#{quote}</blockquote> <blockquote>#{quote}</blockquote>
</aside> </aside>
@ -1726,8 +1769,8 @@ class BulkImport::Base
upload_sha1 = Upload.sha1_from_short_url(short_url) upload_sha1 = Upload.sha1_from_short_url(short_url)
upload_base62 = Upload.base62_sha1(upload_sha1) upload_base62 = Upload.base62_sha1(upload_sha1)
upload_id = @uploads_by_sha1[upload_sha1] upload_id = @uploads_by_sha1[upload_sha1]
upload_url = @upload_urls_by_id[upload_id] upload_url = upload_id ? @upload_urls_by_id[upload_id] : nil
cdn_url = Discourse.store.cdn_url(upload_url) cdn_url = upload_url ? Discourse.store.cdn_url(upload_url) : ""
attributes = +%{loading="lazy"} attributes = +%{loading="lazy"}
attributes << %{ alt="#{alt}"} if alt.present? attributes << %{ alt="#{alt}"} if alt.present?
@ -1744,9 +1787,9 @@ class BulkImport::Base
name = @mapped_usernames[$1] || $1 name = @mapped_usernames[$1] || $1
normalized_name = User.normalize_username(name) normalized_name = User.normalize_username(name)
if User.where(username_lower: normalized_name).exists? if @usernames_lower.include?(normalized_name)
%|<a class="mention" href="/u/#{normalized_name}">@#{name}</a>| %|<a class="mention" href="/u/#{normalized_name}">@#{name}</a>|
elsif Group.where("LOWER(name) = ?", normalized_name).exists? elsif @group_names_lower.include?(normalized_name)
%|<a class="mention-group" href="/groups/#{normalized_name}">@#{name}</a>| %|<a class="mention-group" href="/groups/#{normalized_name}">@#{name}</a>|
else else
"@#{name}" "@#{name}"
@ -1761,7 +1804,8 @@ class BulkImport::Base
def user_avatar(user) def user_avatar(user)
url = user.avatar_template.gsub("{size}", "45") url = user.avatar_template.gsub("{size}", "45")
"<img alt=\"\" width=\"20\" height=\"20\" src=\"#{url}\" class=\"avatar\"> #{user.username}" # TODO name/username preference check
"<img alt=\"\" width=\"20\" height=\"20\" src=\"#{url}\" class=\"avatar\"> #{user.name.presence || user.username}"
end end
def pre_fancy(title) def pre_fancy(title)

View File

@ -72,6 +72,7 @@ class BulkImport::Generic < BulkImport::Base
import_category_custom_fields import_category_custom_fields
import_category_tag_groups import_category_tag_groups
import_category_permissions import_category_permissions
import_category_users
import_topics import_topics
import_posts import_posts
@ -315,6 +316,33 @@ class BulkImport::Generic < BulkImport::Base
permissions.close permissions.close
end end
# Imports rows from the intermediate DB's `category_users` table.
#
# For every source row the imported category and user ids are translated to
# their new ids. A row is skipped when
#   * either id cannot be translated (the category or user was not imported), or
#   * a CategoryUser with the same (category_id, user_id) pair already exists.
#
# The result set returned by `query` is closed once all rows were processed.
def import_category_users
  puts "", "Importing category users..."

  category_users = query(<<~SQL)
    SELECT *
    FROM category_users
    ORDER BY category_id, user_id
  SQL

  existing_category_user_ids = CategoryUser.pluck(:category_id, :user_id).to_set

  create_category_users(category_users) do |row|
    category_id = category_id_from_imported_id(row["category_id"])
    user_id = user_id_from_imported_id(row["user_id"])

    # Without both ids the record would reference nothing, so skip it
    # (same guard style as the `next unless <id>` checks in import_permalinks).
    next unless category_id && user_id
    next if existing_category_user_ids.include?([category_id, user_id])

    {
      category_id: category_id,
      user_id: user_id,
      notification_level: row["notification_level"],
      last_seen_at: to_datetime(row["last_seen_at"]),
    }
  end

  category_users.close
end
def import_groups def import_groups
puts "", "Importing groups..." puts "", "Importing groups..."
@ -465,9 +493,12 @@ class BulkImport::Generic < BulkImport::Base
users = query(<<~SQL) users = query(<<~SQL)
SELECT id, timezone, email_level, email_messages_level, email_digests SELECT id, timezone, email_level, email_messages_level, email_digests
FROM users FROM users
WHERE timezone IS NOT NULL WHERE timezone IS NOT NULL
ORDER BY id OR email_level IS NOT NULL
OR email_messages_level IS NOT NULL
OR email_digests IS NOT NULL
ORDER BY id
SQL SQL
existing_user_ids = UserOption.pluck(:user_id).to_set existing_user_ids = UserOption.pluck(:user_id).to_set
@ -502,6 +533,8 @@ class BulkImport::Generic < BulkImport::Base
user_fields.each do |row| user_fields.each do |row|
next if existing_user_field_names.include?(row["name"]) next if existing_user_field_names.include?(row["name"])
# TODO: Use `id` and store it in mapping table, but for now just ignore it.
row.delete("id")
options = row.delete("options") options = row.delete("options")
field = UserField.create!(row) field = UserField.create!(row)
@ -647,12 +680,10 @@ class BulkImport::Generic < BulkImport::Base
posts = query(<<~SQL) posts = query(<<~SQL)
SELECT * SELECT *
FROM posts FROM posts
ORDER BY topic_id, id ORDER BY topic_id, post_number, id
SQL SQL
group_names = Group.pluck(:id, :name).to_h group_names = Group.pluck(:id, :name).to_h
# TODO: Investigate feasibility of loading all users on large sites
user_names = User.pluck(:id, :username).to_h
create_posts(posts) do |row| create_posts(posts) do |row|
next if row["raw"].blank? next if row["raw"].blank?
@ -667,7 +698,7 @@ class BulkImport::Generic < BulkImport::Base
topic_id: topic_id, topic_id: topic_id,
user_id: user_id_from_imported_id(row["user_id"]), user_id: user_id_from_imported_id(row["user_id"]),
created_at: to_datetime(row["created_at"]), created_at: to_datetime(row["created_at"]),
raw: post_raw(row, group_names, user_names), raw: post_raw(row, group_names),
like_count: row["like_count"], like_count: row["like_count"],
reply_to_post_number: reply_to_post_number:
row["reply_to_post_id"] ? post_number_from_imported_id(row["reply_to_post_id"]) : nil, row["reply_to_post_id"] ? post_number_from_imported_id(row["reply_to_post_id"]) : nil,
@ -677,7 +708,7 @@ class BulkImport::Generic < BulkImport::Base
posts.close posts.close
end end
def post_raw(row, group_names, user_names) def post_raw(row, group_names)
raw = row["raw"] raw = row["raw"]
placeholders = row["placeholders"]&.then { |json| JSON.parse(json) } placeholders = row["placeholders"]&.then { |json| JSON.parse(json) }
@ -706,13 +737,23 @@ class BulkImport::Generic < BulkImport::Base
mentions.each do |mention| mentions.each do |mention|
name = name =
if mention["type"] == "user" if mention["type"] == "user"
user_names[user_id_from_imported_id(mention["id"])] if mention["id"]
username_from_id(user_id_from_imported_id(mention["id"]))
elsif mention["name"]
user_id = user_id_from_original_username(mention["name"])
user_id ? username_from_id(user_id) : mention["name"]
end
elsif mention["type"] == "group" elsif mention["type"] == "group"
group_names[group_id_from_imported_id(mention["id"])] if mention["id"]
group_id = group_id_from_imported_id(mention["id"])
group_id ? group_names[group_id] : mention["name"]
else
mention["name"]
end
end end
puts "#{mention["type"]} not found -- #{mention["id"]}" unless name puts "#{mention["type"]} not found -- #{mention["placeholder"]}" unless name
raw.gsub!(mention["placeholder"], "@#{name}") raw.gsub!(mention["placeholder"], " @#{name} ")
end end
end end
@ -726,6 +767,72 @@ class BulkImport::Generic < BulkImport::Base
raw.gsub!(event["placeholder"], event_bbcode(event_details)) if event_details raw.gsub!(event["placeholder"], event_bbcode(event_details)) if event_details
end end
if (quotes = placeholders&.fetch("quotes", nil))
quotes.each do |quote|
user_id =
if quote["user_id"]
user_id_from_imported_id(quote["user_id"])
elsif quote["username"]
user_id_from_original_username(quote["username"])
end
username = quote["username"]
name = nil
if user_id
username = username_from_id(user_id)
name = user_full_name_from_id(user_id)
end
bbcode =
if username.present? && name.present?
%Q|[quote="#{name}, username:#{username}"]|
elsif username.present?
%Q|[quote="#{username}"]|
else
"[quote]"
end
raw.gsub!(quote["placeholder"], bbcode)
end
end
if (links = placeholders&.fetch("links", nil))
links.each do |link|
text = link["text"]
original_url = link["url"]
markdown =
if link["topic_id"]
topic_id = topic_id_from_imported_id(link["topic_id"])
url = topic_id ? "#{Discourse.base_url}/t/#{topic_id}" : original_url
text ? "[#{text}](#{url})" : url
elsif link["post_id"]
topic_id = topic_id_from_imported_post_id(link["post_id"])
post_number = post_number_from_imported_id(link["post_id"])
url =
(
if topic_id && post_number
"#{Discourse.base_url}/t/#{topic_id}/#{post_number}"
else
original_url
end
)
text ? "[#{text}](#{url})" : url
else
text ? "[#{text}](#{original_url})" : original_url
end
# ensure that the placeholder is surrounded by whitespace unless it's at the beginning or end of the string
placeholder = link["placeholder"]
escaped_placeholder = Regexp.escape(placeholder)
raw.gsub!(/(?<!\s)#{escaped_placeholder}/, " #{placeholder}")
raw.gsub!(/#{escaped_placeholder}(?!\s)/, "#{placeholder} ")
raw.gsub!(placeholder, markdown)
end
end
if row["upload_ids"].present? && @uploads_db if row["upload_ids"].present? && @uploads_db
upload_ids = JSON.parse(row["upload_ids"]) upload_ids = JSON.parse(row["upload_ids"])
upload_ids_placeholders = (["?"] * upload_ids.size).join(",") upload_ids_placeholders = (["?"] * upload_ids.size).join(",")
@ -2061,110 +2168,86 @@ class BulkImport::Generic < BulkImport::Base
end end
def import_permalinks def import_permalinks
puts "", "Importing permalinks for topics..." puts "", "Importing permalinks..."
rows = query(<<~SQL) rows = query(<<~SQL)
SELECT id, old_relative_url SELECT *
FROM topics
WHERE old_relative_url IS NOT NULL
ORDER BY id
SQL
existing_permalinks = Permalink.where("topic_id IS NOT NULL").pluck(:topic_id).to_set
create_permalinks(rows) do |row|
topic_id = topic_id_from_imported_id(row["id"])
next if !topic_id || existing_permalinks.include?(topic_id)
{ url: row["old_relative_url"], topic_id: topic_id }
end
rows.close
puts "", "Importing permalinks for posts..."
rows = query(<<~SQL)
SELECT id, old_relative_url
FROM posts
WHERE old_relative_url IS NOT NULL
ORDER BY id
SQL
existing_permalinks = Permalink.where("post_id IS NOT NULL").pluck(:post_id).to_set
create_permalinks(rows) do |row|
post_id = post_id_from_imported_id(row["id"])
next if !post_id || existing_permalinks.include?(post_id)
{ url: row["old_relative_url"], post_id: post_id }
end
rows.close
puts "", "Importing permalinks for categories..."
rows = query(<<~SQL)
SELECT id, old_relative_url
FROM categories
WHERE old_relative_url IS NOT NULL
ORDER BY id
SQL
existing_permalinks = Permalink.where("category_id IS NOT NULL").pluck(:category_id).to_set
create_permalinks(rows) do |row|
category_id = category_id_from_imported_id(row["id"])
next if !category_id || existing_permalinks.include?(category_id)
{ url: row["old_relative_url"], category_id: category_id }
end
rows.close
if @tag_mapping
puts "", "Importing permalinks for tags..."
rows = query(<<~SQL)
SELECT id, old_relative_url
FROM tags
WHERE old_relative_url IS NOT NULL
ORDER BY id
SQL
existing_permalinks = Permalink.where("tag_id IS NOT NULL").pluck(:tag_id).to_set
create_permalinks(rows) do |row|
tag_id = @tag_mapping[row["id"]]
next if !tag_id || existing_permalinks.include?(tag_id)
{ url: row["old_relative_url"], tag_id: tag_id }
end
rows.close
else
puts " Skipping import of topic tags because tags have not been imported."
end
puts "", "Importing permalinks for external/relative URLs..."
rows = query(<<~SQL)
SELECT url, external_url
FROM permalinks FROM permalinks
WHERE external_url IS NOT NULL
ORDER BY url ORDER BY url
SQL SQL
existing_permalinks = Permalink.where("external_url IS NOT NULL").pluck(:external_url).to_set existing_permalinks = Permalink.pluck(:url).to_set
if !@tag_mapping
puts "Skipping import of permalinks for tags because tags have not been imported."
end
create_permalinks(rows) do |row| create_permalinks(rows) do |row|
next if existing_permalinks.include?(row["external_url"]) next if existing_permalinks.include?(row["url"])
{ url: row["url"], external_url: row["external_url"] } if row["topic_id"]
topic_id = topic_id_from_imported_id(row["topic_id"])
next unless topic_id
{ url: row["url"], topic_id: topic_id }
elsif row["post_id"]
post_id = post_id_from_imported_id(row["post_id"])
next unless post_id
{ url: row["url"], post_id: post_id }
elsif row["category_id"]
category_id = category_id_from_imported_id(row["category_id"])
next unless category_id
{ url: row["url"], category_id: category_id }
elsif row["tag_id"]
next unless @tag_mapping
tag_id = @tag_mapping[row["tag_id"]]
next unless tag_id
{ url: row["url"], tag_id: tag_id }
elsif row["user_id"]
user_id = user_id_from_imported_id(row["user_id"])
next unless user_id
{ url: row["url"], user_id: user_id }
elsif row["external_url"]
external_url = calculate_external_url(row)
next unless external_url
{ url: row["url"], external_url: external_url }
end
end end
rows.close rows.close
end end
# Builds the final external URL for a permalink row by resolving its
# placeholders.
#
# `row["external_url_placeholders"]` is an optional JSON array of objects with
# the string keys "type", "placeholder" and "id". Supported types:
#   * "category_url"      -> "c/<slug path>/<category id>"
#   * "category_slug_ref" -> the category's slug_ref
#   * "tag_name"          -> the tag's name
#
# @param row [Hash] permalink row with "external_url" and, optionally,
#   "external_url_placeholders"
# @return [String, nil] the resolved URL; the untouched `row["external_url"]`
#   when there are no placeholders; nil when a "tag_name" placeholder cannot
#   be resolved because tags were not imported (@tag_mapping is unset), so the
#   caller can skip the row
# @raise [RuntimeError] for an unknown placeholder type
def calculate_external_url(row)
  external_url = row["external_url"]
  placeholders = row["external_url_placeholders"]&.then { |json| JSON.parse(json) }
  return external_url unless placeholders

  # Substitute on a copy so the row's own string is not modified in place.
  external_url = external_url.dup

  placeholders.each do |placeholder|
    # JSON.parse produces string keys, so every lookup uses string keys.
    case placeholder["type"]
    when "category_url"
      category_id = category_id_from_imported_id(placeholder["id"])
      category = Category.find(category_id)
      external_url.gsub!(
        placeholder["placeholder"],
        "c/#{category.slug_path.join("/")}/#{category.id}",
      )
    when "category_slug_ref"
      category_id = category_id_from_imported_id(placeholder["id"])
      category = Category.find(category_id)
      external_url.gsub!(placeholder["placeholder"], category.slug_ref)
    when "tag_name"
      # Without a tag mapping the placeholder would stay in the URL verbatim;
      # return nil instead of a URL that still contains the raw placeholder.
      return nil unless @tag_mapping

      tag_id = @tag_mapping[placeholder["id"]]
      tag = Tag.find(tag_id)
      external_url.gsub!(placeholder["placeholder"], tag.name)
    else
      raise "Unknown placeholder type: #{placeholder["type"]}"
    end
  end

  external_url
end
def create_connection(path) def create_connection(path)
sqlite = SQLite3::Database.new(path, results_as_hash: true) sqlite = SQLite3::Database.new(path, results_as_hash: true)
sqlite.busy_timeout = 60_000 # 60 seconds sqlite.busy_timeout = 60_000 # 60 seconds