# frozen_string_literal: true
require File . expand_path ( File . dirname ( __FILE__ ) + " /base.rb " )
require 'csv'
# Importer for Friends+Me Google+ Exporter (F+MG+E) output.
#
# Takes the full path (absolute or relative) to
# * each of the F+MG+E JSON export files you want to import
# * the F+MG+E google-plus-image-list.csv file,
# * a categories.json file you write to describe how the Google+
# categories map to Discourse categories, subcategories, and tags.
#
# You can provide all the F+MG+E JSON export files in a single import
# run. This will be the fastest way to do the entire import if you
# have enough memory and disk space. It will work just as well to
# import each F+MG+E JSON export file separately. This might be
# valuable if you have memory or space limitations, as the memory to
# hold all the data from the F+MG+E JSON export files is one of the
# key resources used by this script.
#
# Create an initial empty ("{}") categories.json file, and the import
# script will write a .new file for you to fill in the details.
# You will probably want to use jq to reformat the .new file before
# trying to edit it. `jq . categories.json.new > categories.json`
#
# Provide a filename that ends with "upload-paths.txt" and the names
# of each of the files uploaded will be written to the file with that
# name
#
# Edit values at the top of the script to fit your preferences
class ImportScripts :: FMGP < ImportScripts :: Base
# Sets the importer's tunables and working state, then interprets every
# command-line argument according to its file-name suffix or option prefix.
#
# Raises RuntimeError for an argument that matches none of the known
# patterns, and when no categories.json argument was supplied.
def initialize
  super

  # Set this to the base URL for the site; required for importing videos
  # typically just 'https:' in production
  @site_base_url = 'http://localhost:3000'
  @system_user = Discourse.system_user
  SiteSetting.max_image_size_kb = 40960
  SiteSetting.max_attachment_size_kb = 40960
  # handle the same video extension as the rest of Discourse
  SiteSetting.authorized_extensions = (SiteSetting.authorized_extensions.split(" | ") + ['mp4', 'mov', 'webm', 'ogv']).uniq.join(" | ")
  @invalid_bounce_score = 5.0
  @min_title_words = 3
  @max_title_words = 14
  @min_title_characters = 12
  @min_post_raw_characters = 12
  # Set to true to create categories in categories.json. Does
  # not honor parent relationships; expects categories to be
  # rearranged after import.
  @create_categories = false
  # JSON files produced by F+MG+E as an export of a community
  @feeds = []
  # CSV is map to downloaded images and/or videos (exported separately)
  @images = {}
  # map from Google ID to local system users where necessary
  # {
  #   "128465039243871098234": "handle"
  # }
  # GoogleID 128465039243871098234 will show up as @handle
  @usermap = {}
  # G+ user IDs to filter out (spam, abuse) — no topics or posts, silence and suspend when creating
  # loaded from blocklist.json as array of google ids `[ 92310293874, 12378491235293 ]`
  @blocklist = Set[]
  # G+ user IDs whose posts are useful; if this is set, include only
  # posts (and non-blocklisted comments) authored by these IDs
  @allowlist = nil
  # Tags to apply to every topic; empty Array to not have any tags applied everywhere
  @globaltags = [" gplus "]
  @imagefiles = nil
  # categories.json file is map:
  # "google-category-uuid": {
  #   "name": 'google+ category name',
  #   "category": 'category name',
  #   "parent": 'parent name', # optional
  #   "create": true, # optional
  #   "tags": ['list', 'of', 'tags'] optional
  # }
  # Start with '{}', let the script generate categories.json.new once, then edit and re-run
  @categories = {}
  # keep track of the filename in case we need to write a .new file
  @categories_filename = nil
  # dry run parses but doesn't create
  @dryrun = false
  # @last_date cuts off at a certain date, for late-spammed abandoned communities
  @last_date = nil
  # @first_date starts at a certain date, for early-spammed rescued communities
  @first_date = nil

  # every argument is a filename, do the right thing based on the file name
  # NOTE(review): the double-quoted suffixes/prefixes below carry leading and
  # trailing spaces exactly as found in this file; confirm they are intended.
  ARGV.each do |arg|
    if arg.end_with?('.csv')
      # CSV files produced by F+MG+E have "URL";"IsDownloaded";"FileName";"FilePath";"FileSize"
      CSV.foreach(arg, headers: true, col_sep: ';') do |row|
        @images[row[0]] = {
          filename: row[2],
          filepath: row[3],
          filesize: row[4]
        }
      end
    elsif arg.end_with?(" upload-paths.txt ")
      # "w" is the only valid spelling of the write mode; a padded mode
      # string makes File.open raise ArgumentError.
      @imagefiles = File.open(arg, "w")
    elsif arg.end_with?('categories.json')
      @categories_filename = arg
      @categories = load_fmgp_json(arg)
    elsif arg.end_with?(" usermap.json ")
      @usermap = load_fmgp_json(arg)
    elsif arg.end_with?('blocklist.json')
      @blocklist = load_fmgp_json(arg).map(&:to_s).to_set
    elsif arg.end_with?('allowlist.json')
      @allowlist = load_fmgp_json(arg).map(&:to_s).to_set
    elsif arg.end_with?('.json')
      @feeds << load_fmgp_json(arg)
    elsif arg == '--dry-run'
      @dryrun = true
    elsif arg.start_with?(" --last-date= ")
      @last_date = Time.zone.parse(arg.gsub(/ .*= /, ''))
    elsif arg.start_with?(" --first-date= ")
      @first_date = Time.zone.parse(arg.gsub(/ .*= /, ''))
    else
      raise RuntimeError, " unknown argument #{arg} "
    end
  end

  raise RuntimeError, " Must provide a categories.json file " if @categories_filename.nil?

  # store the actual category objects looked up in the database
  @cats = {}
  # remember google auth DB lookup results
  @emails = {}
  @newusers = {}
  @users = {}
  # remember uploaded images
  @uploaded = {}
  # counters for post progress
  @topics_imported = 0
  @posts_imported = 0
  @topics_skipped = 0
  @posts_skipped = 0
  @blocked_topics = 0
  @blocked_posts = 0
  # count uploaded file size
  @totalsize = 0
end
# Entry point for the import: runs every phase in order, closes the optional
# upload-paths log, and prints the total number of uploaded bytes.
def execute
  puts " ", " Importing from Friends+Me Google+ Exporter... "

  phases = %i[read_categories check_categories map_categories import_users import_posts]
  phases.each { |phase| send(phase) }

  # No need to set trust level 0 for any imported users unless F+MG+E gets the
  # ability to add +1 data, in which case users who have only done a +1 and
  # neither posted nor commented should be TL0, in which case this should be
  # called after all other processing done
  # update_tl0

  @imagefiles.close unless @imagefiles.nil?

  puts " ", " Uploaded #{@totalsize} bytes of image files "
  puts " ", " Done "
end
# Reads +filename+ and returns its parsed JSON content.
#
# Raises RuntimeError when the file does not exist. Uses File.exist?, since
# the File.exists? alias is deprecated and absent from Ruby 3.2 onwards.
def load_fmgp_json(filename)
  raise RuntimeError, " File #{filename} not found " unless File.exist?(filename)

  JSON.parse(File.read(filename))
end
# Walks every category of every community in the loaded feeds and makes sure
# @categories has an entry for it.
#
# Unknown category ids get a placeholder entry for the operator to fill in;
# known entries that lack a community name get it filled in from the feed.
def read_categories
  @feeds.each do |feed|
    feed[" accounts "].each do |account|
      account[" communities "].each do |community|
        community[" categories "].each do |category|
          google_id = category[" id "]
          if !@categories[google_id].present?
            # Create empty entries to write and fill in manually
            @categories[google_id] = {
              " name " => category[" name "],
              " community " => community[" name "],
              " category " => " ",
              " parent " => nil,
              " tags " => [],
            }
          elsif !@categories[google_id][" community "].present?
            @categories[google_id][" community "] = community[" name "]
          end
        end
      end
    end
  end
end
# Verifies that every entry in @categories names a target category.
#
# When any entry has a missing or blank target, writes the whole map as JSON
# to a ".new" file next to the categories file and raises RuntimeError
# listing the incomplete Google+ category names, so the operator can fill
# the file in and re-run.
def check_categories
  incomplete_categories = []
  @categories.each do |_id, c|
    # Blank covers both "no key at all" and "placeholder not yet filled in";
    # a single check keeps the two cases from drifting apart.
    next if c[" category "].present?

    c[" category "] = " "
    incomplete_categories << c[" name "]
  end
  return if incomplete_categories.empty?

  categories_new = " #{@categories_filename} .new "
  # Finish writing before raising so the file is complete on disk.
  File.open(categories_new, "w") { |f| f.write(@categories.to_json) }
  # The message names @categories_filename; the previous @category_filename
  # was never assigned and interpolated as an empty string.
  raise RuntimeError, " Category file missing categories for #{incomplete_categories} , edit #{categories_new} and rename it to #{@categories_filename} before running the same import "
end
# Resolves every entry of @categories to a Category record and stores it in
# @cats under the same Google+ category id.
#
# Entries with a parent are matched by category name AND parent name.
# Entries without a parent are matched by name, or created when
# @create_categories is set. Raises RuntimeError when nothing was found.
def map_categories
  puts " ", " Mapping categories from Google+ to Discourse... "
  @categories.each do |id, cat|
    if cat[" parent "].present?
      # Two separate sub-categories can have the same name, so need to identify by parent
      Category.where(name: cat[" category "]).each do |category|
        parent = Category.where(id: category.parent_category_id).first
        # A same-named top-level category has no parent record; skip it
        # instead of calling #name on nil.
        @cats[id] = category if parent&.name == cat[" parent "]
      end
    elsif (category = Category.where(name: cat[" category "]).first)
      @cats[id] = category
    elsif @create_categories
      # NOTE(review): this branch reads the key 'category' while the lookups
      # above read " category " — confirm which spelling categories.json uses.
      puts " Creating #{cat['category']} "
      @cats[id] = create_category({ name: cat['category'], id: id }, id)
    end

    raise RuntimeError, " Could not find category #{cat[" category "]} for #{cat} " if @cats[id].nil?
  end
end
# Registers every Google+ user referenced by the feeds — authors of posts and
# comments, plus users mentioned in their messages — and then, unless this is
# a dry run, creates the accounts collected in @newusers.
def import_users
  puts '', " Importing Google+ post and comment author users... "

  # collect authors of both posts and comments
  @feeds.each do |feed|
    feed[" accounts "].each do |account|
      account[" communities "].each do |community|
        community[" categories "].each do |category|
          category[" posts "].each do |post|
            # the post itself first, then its comments, in feed order
            [post, *post[" comments "]].each do |entry|
              import_author_user(entry[" author "])
              import_message_users(entry[" message "]) if entry[" message "].present?
            end
          end
        end
      end
    end
  end

  return if @dryrun

  # now create them all
  create_users(@newusers) do |id, u|
    {
      id: id,
      email: u[:email],
      name: u[:name],
      post_create_action: u[:post_create_action]
    }
  end
end
# Registers the author of a post or comment, passing the author hash's id
# and display name on to import_google_user.
def import_author_user(author)
  import_google_user(author[" id "], author[" name "])
end
# Registers every user referenced inside a message. A fragment whose first
# element is 3 is a user reference: element 1 is the name, element 2 the id.
def import_message_users(message)
  message.each do |fragment|
    next unless fragment[0] == 3
    # deleted G+ users show up with a null ID
    next if fragment[2].nil?

    import_google_user(fragment[2], fragment[1])
  end
end
# Resolves one Google+ user, at most once per id (results are remembered in
# @emails).
#
# When a google_oauth2 associated account exists, the local user is stored in
# @users, and the id is blocklisted if that user is silenced or suspended.
# Otherwise an entry is queued in @newusers, with a post-create hook that
# approves the account, silences it when blocklisted, links the Google
# account and raises the bounce score so no mail goes to the placeholder
# address.
def import_google_user(id, name)
  return if @emails[id].present?

  google_user_info = UserAssociatedAccount.find_by(provider_name: 'google_oauth2', provider_uid: id.to_i)
  if google_user_info.nil?
    # create new google user on system; expect this user to merge
    # when they later log in with google authentication
    # Note that because email address is not included in G+ data, we
    # don't know if they already have another account not yet associated
    # with google oauth2. If they didn't log in, they'll have an
    # @gplus.invalid address associated with their account
    email = " #{id} @gplus.invalid "
    @newusers[id] = {
      email: email,
      name: name,
      post_create_action: proc do |newuser|
        newuser.approved = true
        newuser.approved_by_id = @system_user.id
        newuser.approved_at = newuser.created_at
        if @blocklist.include?(id.to_s)
          forever = 1000.years.from_now
          # you can suspend as well if you want your blocklist to
          # be hard to recover from
          #newuser.suspended_at = DateTime.now
          #newuser.suspended_till = forever
          newuser.silenced_till = forever
        end
        newuser.save
        @users[id] = newuser
        UserAssociatedAccount.create(provider_name: 'google_oauth2', user_id: newuser.id, provider_uid: id)
        # Do not send email to the invalid email addresses
        # this can be removed after merging with #7162
        s = UserStat.where(user_id: newuser.id).first
        s.bounce_score = @invalid_bounce_score
        s.reset_bounce_score_after = 1000.years.from_now
        s.save
      end
    }
  else
    # user already on system
    u = User.find(google_user_info.user_id)
    # Every blocklist lookup uses id.to_s, so store the string form too;
    # a raw non-String id would never be found again.
    @blocklist.add(id.to_s) if u.silenced? || u.suspended?
    @users[id] = u
    email = u.email
  end
  @emails[id] = email
end
# Imports every Google+ post in the loaded feeds, printing a running progress
# line after each one.
#
# "post" is confusing:
# - A google+ post is a discourse topic
# - A google+ comment is a discourse post
def import_posts
  puts '', " Importing Google+ posts and comments... "
  @feeds.each do |feed|
    feed[" accounts "].each do |account|
      account[" communities "].each do |community|
        community[" categories "].each do |category|
          category[" posts "].each do |post|
            # G+ post / Discourse topic
            import_topic(post, category)
            print(" \r #{@topics_imported} / #{@posts_imported} topics/posts (skipped: #{@topics_skipped} / #{@posts_skipped} blocklisted: #{@blocked_topics} / #{@blocked_posts} ) ")
          end
        end
      end
    end
  end
  puts ''
end
# Imports one Google+ post as a topic, followed by its comments as replies.
#
# A post that was imported earlier is looked up instead of re-created, so
# comments added since can still be attached. New posts are dropped when an
# allowlist is set and the author is not on it, or when make_postmap rejects
# them. Progress counters are updated for every outcome except the
# allowlist rejection.
def import_topic(post, category)
  # no parent for discourse topics / G+ posts
  if (topic_id = post_id_from_imported_post_id(post[" id "]))
    # already imported topic; might need to attach more comments/posts
    topic_post = Post.find_by(id: topic_id)
    @topics_skipped += 1
  else
    # new post; only ignore non-allowlisted if allowlist defined.
    # The allowlist holds string ids, so compare the string form.
    return if !@allowlist.nil? && !@allowlist.include?(post[" author "][" id "].to_s)

    postmap = make_postmap(post, category, nil)
    if postmap.nil?
      @blocked_topics += 1
      return
    end
    topic_post = create_post(postmap, postmap[:id]) if !@dryrun
    @topics_imported += 1
  end

  # iterate over comments in post
  post[" comments "].each do |comment|
    if post_id_from_imported_post_id(comment[" id "])
      @posts_skipped += 1
      next
    end

    # category is nil for comments
    commentmap = make_postmap(comment, nil, topic_post)
    if commentmap.nil?
      @blocked_posts += 1
    else
      @posts_imported += 1
      create_post(commentmap, commentmap[:id]) if !@dryrun
    end
  end
end
# Builds the attribute hash handed to create_post for a Google+ post (when
# +category+ is given) or comment (when +category+ is nil and +parent+ is the
# topic's first post).
#
# Returns nil when the entry must not be imported: blocklisted author, body
# shorter than @min_post_raw_characters, or creation time outside the
# optional @first_date..@last_date window.
def make_postmap(post, category, parent)
  post_author_id = post[" author "][" id "]
  return nil if @blocklist.include?(post_author_id.to_s)

  raw = formatted_message(post)
  # if no message, image, or images, it's just empty
  return nil if raw.length < @min_post_raw_characters

  created_at = Time.zone.parse(post[" createdAt "])
  return nil if !@last_date.nil? && created_at > @last_date
  # This guard used to test a misspelled, never-assigned variable, so the
  # first-date cutoff was silently ignored.
  return nil if !@first_date.nil? && created_at < @first_date

  user_id = user_id_from_imported_user_id(post_author_id)
  user_id = @users[post_author_id].id if user_id.nil?

  mapped = {
    id: post[" id "],
    user_id: user_id,
    created_at: created_at,
    raw: raw,
    cook_method: Post.cook_methods[:regular],
  }

  # nil category for comments, set for posts, so post-only things here
  if !category.nil?
    cat_id = category[" id "]
    mapped[:title] = parse_title(post, created_at)
    mapped[:category] = @cats[cat_id].id
    category_tags = @categories[cat_id][" tags "]
    mapped[:tags] =
      if category_tags.present?
        (@globaltags + [category_tags]).flatten
      else
        @globaltags.dup
      end
  else
    mapped[:topic_id] = parent.topic_id if !@dryrun
  end
  # FIXME: import G+ "+1" as "like" if F+MG+E feature request implemented

  mapped
end
# Chooses a topic title for a Google+ post. G+ has no titles, so one is
# derived from the message text when there is one, and from the author name
# and timestamp otherwise.
def parse_title(post, created_at)
  # no message: probably just posted an image and/or album
  return untitled(post[" author "][" name "], created_at) unless post[" message "].present?

  title_text(post, created_at)
end
# Derives a title from the words of a post's message.
#
# Falls back to untitled when the text has fewer than @min_title_words words
# or fewer than @min_title_characters characters. Otherwise keeps at most
# @max_title_words words and, if a word at index @min_title_words or later
# ends in punctuation, cuts after the last such word (a full stop is
# preferred over other punctuation).
def title_text(post, created_at)
  words = message_text(post[" message "])
  too_short = words.empty? ||
              words.join(" ").length < @min_title_characters ||
              words.length < @min_title_words
  # database has minimum length
  # short posts appear not to work well as titles most of the time (in practice)
  return untitled(post[" author "][" name "], created_at) if too_short

  words = words[0..(@max_title_words - 1)]
  candidates = (@min_title_words..(words.length - 1))

  # prefer full stop, fall back on other punctuation
  # NOTE(review): the full-stop suffix literal is space-padded as found in
  # this file; confirm it is meant to match ordinary words.
  lastword = candidates.select { |i| words[i].end_with?(" . ") }.last
  lastword ||= candidates.select { |i| words[i].end_with?(',', ';', ':', '?') }.last

  # found a logical terminating word
  words = words[0..lastword] unless lastword.nil?

  # database has max title length, which is longer than a good display shows anyway
  # NOTE(review): this pattern requires a space on each side of the captured
  # text, exactly as found in this file; confirm against the intended limit.
  words.join(" ").scan(/ .{1,254} /)[0]
end
# Fallback title for a post without usable message text, built from the
# author's name and the creation time.
def untitled(name, created_at)
  format(" Google+ post by %s on %s ", name, created_at)
end
# Returns the plain words of a message, without markup.
#
# Fragments of type 0 and 3 are split on whitespace. Fragments of type 2
# (links) contribute their display text as a single entry. All other
# fragment types are ignored.
def message_text(message)
  message.flat_map do |fragment|
    case fragment[0]
    when 0, 3
      fragment[1].split
    when 2
      # use the display text of a link
      [fragment[1]]
    else
      []
    end
  end
end
# Renders a post or comment as the raw text of a Discourse post: the
# formatted message fragments, then attached media, then the post's link
# unless that URL was already recorded in the shared urls_seen set while
# formatting the message.
def formatted_message(post)
  lines = []
  urls_seen = Set[]

  if post[" message "].present?
    post[" message "].each do |fragment|
      lines << formatted_message_fragment(fragment, post, urls_seen)
    end
  end

  # yes, both "image" and "images"; "video" and "videos" :(
  if post[" video "].present?
    lines << " \n #{formatted_link(post[" video "][" proxy "])} \n "
  elsif post[" image "].present?
    # if both image and video, image is a cover image for the video
    lines << " \n #{formatted_link(post[" image "][" proxy "])} \n "
  end

  [" images ", " videos "].each do |media_key|
    next unless post[media_key].present?

    post[media_key].each do |media|
      lines << " \n #{formatted_link(media[" proxy "])} \n "
    end
  end

  if post[" link "].present? && post[" link "][" url "].present?
    url = post[" link "][" url "]
    # add the URL only if it wasn't already referenced, because
    # they are often redundant
    unless urls_seen.include?(url)
      lines << " \n #{url} \n "
      urls_seen.add(url)
    end
  end

  lines.join(" ")
end
# Renders one message fragment. The first element of a fragment selects its
# type:
#   0 - text, optionally with a formatting hash as third element
#   1 - line break
#   2 - link (display text, url); the url is recorded in urls_seen
#   3 - reference to a user (name, Google id)
#   4 - hashtag, the octothorpe is included
# Raises RuntimeError for any other type.
#
# markdown does not nest reliably the same as either G+'s markup or what
# users intended in G+, so generate HTML codes
def formatted_message_fragment(fragment, post, urls_seen)
  case fragment[0]
  when 0
    # Random zero-width join characters break the output; in particular, they are
    # common after plus-references and break @name recognition. Just get rid of them.
    # Also deal with 0x80 (really‽) and non-breaking spaces
    # NOTE(review): both patterns are reproduced exactly as found in this
    # file; confirm they still match the characters named above.
    text = fragment[1].gsub(/ ( \ u200d| \ u0080) /, " ").gsub(/ \ u00a0 /, " ")
    style = fragment[2]
    return text if style.nil?

    text = " <i> #{text} </i> " if style[" italic "].present?
    text = " <b> #{text} </b> " if style[" bold "].present?
    # s more likely than del to represent user intent?
    text = " <s> #{text} </s> " if style[" strikethrough "].present?
    text
  when 1
    " \n "
  when 2
    urls_seen.add(fragment[2])
    formatted_link_text(fragment[2], fragment[1])
  when 3
    formatted_user_reference(fragment[1], fragment[2])
  when 4
    fragment[1]
  else
    raise RuntimeError, " message code #{fragment[0]} not recognized! "
  end
end

# Renders a reference to the Google+ user +google_id+, displayed in G+ as
# +name+: an @-mention when the user is known (usermap, this import, or a
# linked Google account), the bold G+ name otherwise.
def formatted_user_reference(name, google_id)
  return " @ #{@usermap[google_id.to_s]} " if @usermap.include?(google_id.to_s)
  # deleted G+ users show up with a null ID
  return " <b>+ #{name} </b> " if google_id.nil?

  # G+ occasionally doesn't put proper spaces after users
  user = find_user_by_import_id(google_id)
  # user was in this import's authors
  return " @ #{user.username} " if user

  google_user_info = UserAssociatedAccount.find_by(provider_name: 'google_oauth2', provider_uid: google_id)
  if google_user_info
    # user was not in this import, but has logged in or been imported otherwise
    return " @ #{User.find(google_user_info.user_id).username} "
  end

  raise RuntimeError, " Google user #{name} (id #{google_id} ) not imported " if !@dryrun

  # if you want to fall back to their G+ name, just erase the raise above,
  # but this should not happen
  " <b>+ #{name} </b> "
end
# Renders a bare URL, i.e. a link whose display text is the URL itself.
def formatted_link(url)
  formatted_link_text(url, url)
end
# Returns the text that embeds +upload+ in a post: the absolute upload URL
# when the upload's name matches the video-extension pattern, an image
# markdown tag otherwise.
def embedded_image_md(upload)
  # remove unnecessary size logic relative to embedded_image_html
  upload_name = upload.short_url || upload.url
  return @site_base_url + upload.url if upload_name =~ / \ .(mov|mp4|webm|ogv)$ /i

  " ![ #{upload.original_filename} ]( #{upload_name} ) "
end
# Renders a link. When the URL refers to a file downloaded by F+MG+E, the
# file is uploaded (once per URL; later references reuse the upload) and the
# embed text is returned. Any other link is returned as the bare URL.
#
# two ways to present images attached to posts; you may want to edit this for preference
# - display: embedded_image_html(upload)
# - download links: attachment_html(upload, text)
# you might even want to make it depend on the file name.
def formatted_link_text(url, text)
  # F+MG+E provides the URL it downloaded in the text slot
  # we won't use the plus url at all since it will disappear anyway
  url = text if @images[text].present?

  return " \n #{embedded_image_md(@uploaded[url])} " if @uploaded[url].present?

  if @images[url].present?
    image = @images[url]
    missing = " <i>missing/deleted image from Google+</i> "
    return missing unless File.exist?(image[:filepath])

    @imagefiles.write(" #{image[:filepath]} \n ") unless @imagefiles.nil?
    upload = create_upload(@system_user.id, image[:filepath], image[:filename])
    # upload can be nil if the image conversion fails
    # upload.id can be nil for at least videos, and possibly deleted images
    return missing if upload.nil? || upload.id.nil?

    upload.save
    @totalsize += image[:filesize].to_i
    @uploaded[url] = upload
    return " \n #{embedded_image_md(upload)} "
  end

  # Whether or not the display text equals the URL, the bare URL is used:
  # leave the URL bare and Discourse will do the right thing. When the text
  # differs, google has done its own text interpolation that doesn't look
  # good on Discourse, so while it looks like this should be
  #   "[#{text}](#{url})"
  # it actually looks better to throw away the google-provided text.
  url
end
end
# Run the importer only when this file is executed directly, not when it is
# loaded by another script.
ImportScripts::FMGP.new.perform if __FILE__ == $0