DEV: Improve user generic bulk importer anonymization (#27307)

* DEV: Improve user generic bulk importer anonymization

Add support for properly anonymizing:
 - email
 - date_of_birth
 - location
 - website
 - bio

* DEV: Remove unneeded anon username check in `import_user_emails`
This commit is contained in:
Selase Krakani 2024-06-05 11:25:17 +00:00 committed by GitHub
parent c67f810a4b
commit f2c4474c1e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed files with 26 additions and 11 deletions

View File

@ -415,17 +415,11 @@ class BulkImport::Generic < BulkImport::Base
end end
if row["anonymized"] == 1 if row["anonymized"] == 1
while true row["username"] = "anon_#{anon_username_suffix}"
anon_suffix = (SecureRandom.random_number * 100_000_000).to_i
break if !@anonymized_user_suffixes.include?(anon_suffix)
end
row["username"] = "anon_#{anon_suffix}"
row["email"] = "#{row["username"]}#{UserAnonymizer::EMAIL_SUFFIX}" row["email"] = "#{row["username"]}#{UserAnonymizer::EMAIL_SUFFIX}"
row["name"] = nil row["name"] = nil
row["registration_ip_address"] = nil row["registration_ip_address"] = nil
row["date_of_birth"] = nil
@anonymized_user_suffixes << anon_suffix
end end
{ {
@ -455,7 +449,7 @@ class BulkImport::Generic < BulkImport::Base
existing_user_ids = UserEmail.pluck(:user_id).to_set existing_user_ids = UserEmail.pluck(:user_id).to_set
users = query(<<~SQL) users = query(<<~SQL)
SELECT id, email, created_at SELECT id, email, created_at, anonymized
FROM users FROM users
ORDER BY id ORDER BY id
SQL SQL
@ -464,6 +458,11 @@ class BulkImport::Generic < BulkImport::Base
user_id = user_id_from_imported_id(row["id"]) user_id = user_id_from_imported_id(row["id"])
next if user_id && existing_user_ids.include?(user_id) next if user_id && existing_user_ids.include?(user_id)
if row["anonymized"] == 1
username = username_from_id(user_id)
row["email"] = "#{username}#{UserAnonymizer::EMAIL_SUFFIX}"
end
{ user_id: user_id, email: row["email"], created_at: to_datetime(row["created_at"]) } { user_id: user_id, email: row["email"], created_at: to_datetime(row["created_at"]) }
end end
@ -474,7 +473,7 @@ class BulkImport::Generic < BulkImport::Base
puts "", "Importing user profiles..." puts "", "Importing user profiles..."
users = query(<<~SQL) users = query(<<~SQL)
SELECT id, bio, location SELECT id, bio, location, website, anonymized
FROM users FROM users
ORDER BY id ORDER BY id
SQL SQL
@ -485,7 +484,13 @@ class BulkImport::Generic < BulkImport::Base
user_id = user_id_from_imported_id(row["id"]) user_id = user_id_from_imported_id(row["id"])
next if user_id && existing_user_ids.include?(user_id) next if user_id && existing_user_ids.include?(user_id)
{ user_id: user_id, bio_raw: row["bio"], location: row["location"] } if row["anonymized"] == 1
row["bio"] = nil
row["location"] = nil
row["website"] = nil
end
{ user_id: user_id, bio_raw: row["bio"], location: row["location"], website: row["website"] }
end end
users.close users.close
@ -2408,6 +2413,16 @@ class BulkImport::Generic < BulkImport::Base
def to_boolean(value) def to_boolean(value)
value == 1 value == 1
end end
# Picks a random numeric suffix for an anonymized username that has not
# been handed out before during this run.
#
# Draws an integer in 0...100_000_000 from SecureRandom, retries while the
# value is already present in @anonymized_user_suffixes, then records the
# chosen value there so later calls cannot return it again.
#
# @return [Integer] a suffix not previously returned by this method
def anon_username_suffix
  loop do
    # The integer form of random_number yields 0...100_000_000 directly,
    # with no Float multiply/truncate round trip.
    suffix = SecureRandom.random_number(100_000_000)
    next if @anonymized_user_suffixes.include?(suffix)

    @anonymized_user_suffixes << suffix
    return suffix
  end
end
end end
BulkImport::Generic.new(ARGV[0], ARGV[1]).start BulkImport::Generic.new(ARGV[0], ARGV[1]).start