# frozen_string_literal: true

require File.expand_path(File.dirname(__FILE__) + "/base.rb")
require 'csv'

# Importer for Friends+Me Google+ Exporter (F+MG+E) output.
#
# Takes the full path (absolute or relative) to
# * each of the F+MG+E JSON export files you want to import
# * the F+MG+E google-plus-image-list.csv file,
# * a categories.json file you write to describe how the Google+
#   categories map to Discourse categories, subcategories, and tags.
#
# You can provide all the F+MG+E JSON export files in a single import
# run. This will be the fastest way to do the entire import if you
# have enough memory and disk space. It will work just as well to
# import each F+MG+E JSON export file separately. This might be
# valuable if you have memory or space limitations, as the memory
# needed to hold all the data from the F+MG+E JSON export files is one
# of the key resources used by this script.
#
# Create an initial empty ("{}") categories.json file, and the import
# script will write a .new file for you to fill in the details.
# You will probably want to use jq to reformat the .new file before
# trying to edit it: `jq . categories.json.new > categories.json`
#
# Provide a filename that ends with "upload-paths.txt" and the name of
# each file uploaded will be written to that file.
#
# Edit values at the top of the script to fit your preferences.
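#
# Example invocation (paths and file names are illustrative, not
# prescriptive; arguments may be given in any order, and multiple JSON
# export files may be listed):
#
#   bundle exec ruby script/import_scripts/friendsmegplus.rb \
#     categories.json google-plus-image-list.csv \
#     community-feed.json my-upload-paths.txt --dry-run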

class ImportScripts::FMGP < ImportScripts::Base

  def initialize
    super

    # Set this to the base URL for the site; required for importing videos
    # typically just 'https:' in production
    @site_base_url = 'http://localhost:3000'
    @system_user = Discourse.system_user
    SiteSetting.max_image_size_kb = 40960
    SiteSetting.max_attachment_size_kb = 40960
    # handle the same video extensions as the rest of Discourse
    SiteSetting.authorized_extensions = (SiteSetting.authorized_extensions.split("|") + ['mp4', 'mov', 'webm', 'ogv']).uniq.join("|")
    @invalid_bounce_score = 5.0
    @min_title_words = 3
    @max_title_words = 14
    @min_title_characters = 12
    @min_post_raw_characters = 12

    # Set to true to create the categories listed in categories.json.
    # Does not honor parent relationships; expects categories to be
    # rearranged after import.
    @create_categories = false

    # JSON files produced by F+MG+E as an export of a community
    @feeds = []

    # CSV mapping URLs to downloaded images and/or videos (exported separately)
    @images = {}

    # map from Google ID to local system users where necessary
    # {
    #   "128465039243871098234": "handle"
    # }
    # GoogleID 128465039243871098234 will show up as @handle
    @usermap = {}

    # G+ user IDs to filter out (spam, abuse): no topics or posts are
    # imported from them, and they are silenced (and optionally
    # suspended) when created. Loaded from blacklist.json as an array
    # of google ids: `[ 92310293874, 12378491235293 ]`
    @blacklist = Set[]

    # G+ user IDs whose posts are useful; if this is set, include only
    # posts (and non-blacklisted comments) authored by these IDs
    @whitelist = nil

    # Tags to apply to every topic; empty Array to not have any tags applied everywhere
    @globaltags = [ "gplus" ]

    @imagefiles = nil

    # categories.json file is a map:
    # "google-category-uuid": {
    #   "name": 'google+ category name',
    #   "category": 'category name',
    #   "parent": 'parent name',        # optional
    #   "create": true,                 # optional
    #   "tags": ['list', 'of', 'tags']  # optional
    # }
    # Start with '{}', let the script generate categories.json.new once,
    # then edit and re-run
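    #
    # A filled-in entry might look like this (the UUID and names here
    # are hypothetical):
    #
    #   "f1e2d3c4-0000-0000-0000-000000000000": {
    #     "name": "Show and Tell",
    #     "category": "Show and Tell",
    #     "parent": "Google+ Archive",
    #     "tags": ["show-and-tell"]
    #   }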
    @categories = {}

    # keep track of the filename in case we need to write a .new file
    @categories_filename = nil

    # dry run parses but doesn't create
    @dryrun = false

    # @last_date cuts off at a certain date, for late-spammed abandoned communities
    @last_date = nil

    # @first_date starts at a certain date, for early-spammed rescued communities
    @first_date = nil

    # every argument is a filename; do the right thing based on the file name
    ARGV.each do |arg|
      if arg.end_with?('.csv')
        # CSV files produced by F+MG+E have
        # "URL";"IsDownloaded";"FileName";"FilePath";"FileSize"
        CSV.foreach(arg, headers: true, col_sep: ';') do |row|
          @images[row[0]] = {
            filename: row[2],
            filepath: row[3],
            filesize: row[4],
          }
        end
      elsif arg.end_with?("upload-paths.txt")
        @imagefiles = File.open(arg, "w")
      elsif arg.end_with?('categories.json')
        @categories_filename = arg
        @categories = load_fmgp_json(arg)
      elsif arg.end_with?("usermap.json")
        @usermap = load_fmgp_json(arg)
      elsif arg.end_with?('blacklist.json')
        @blacklist = load_fmgp_json(arg).map { |i| i.to_s }.to_set
      elsif arg.end_with?('whitelist.json')
        @whitelist = load_fmgp_json(arg).map { |i| i.to_s }.to_set
      elsif arg.end_with?('.json')
        @feeds << load_fmgp_json(arg)
      elsif arg == '--dry-run'
        @dryrun = true
      elsif arg.start_with?("--last-date=")
        @last_date = Time.zone.parse(arg.gsub(/.*=/, ''))
      elsif arg.start_with?("--first-date=")
        @first_date = Time.zone.parse(arg.gsub(/.*=/, ''))
      else
        raise RuntimeError.new("unknown argument #{arg}")
      end
    end

    raise RuntimeError.new("Must provide a categories.json file") if @categories_filename.nil?

    # store the actual category objects looked up in the database
    @cats = {}

    # remember google auth DB lookup results
    @emails = {}
    @newusers = {}
    @users = {}

    # remember uploaded images
    @uploaded = {}

    # counters for post progress
    @topics_imported = 0
    @posts_imported = 0
    @topics_skipped = 0
    @posts_skipped = 0
    @topics_blacklisted = 0
    @posts_blacklisted = 0

    # count uploaded file size
    @totalsize = 0
  end

  def execute
    puts "", "Importing from Friends+Me Google+ Exporter..."

    read_categories
    check_categories
    map_categories

    import_users
    import_posts

    # No need to set trust level 0 for any imported users unless F+MG+E gets
    # the ability to add +1 data, in which case users who have only done a +1
    # and neither posted nor commented should be TL0; in that case this should
    # be called after all other processing is done
    # update_tl0

    @imagefiles.close() if !@imagefiles.nil?
    puts "", "Uploaded #{@totalsize} bytes of image files"
    puts "", "Done"
  end

  def load_fmgp_json(filename)
    raise RuntimeError.new("File #{filename} not found") if !File.exist?(filename)
    JSON.parse(File.read(filename))
  end
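
  # Each F+MG+E feed file nests its data as
  # accounts > communities > categories > posts > comments;
  # read_categories, import_users, and import_posts all walk
  # this same hierarchy.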
  def read_categories
    @feeds.each do |feed|
      feed["accounts"].each do |account|
        account["communities"].each do |community|
          community["categories"].each do |category|
            if !@categories[category["id"]].present?
              # create empty entries to write out and fill in manually
              @categories[category["id"]] = {
                "name" => category["name"],
                "community" => community["name"],
                "category" => "",
                "parent" => nil,
                "tags" => [],
              }
            elsif !@categories[category["id"]]["community"].present?
              @categories[category["id"]]["community"] = community["name"]
            end
          end
        end
      end
    end
  end

  def check_categories
    # raise a useful exception if necessary data is not found in categories.json
    incomplete_categories = []
    @categories.each do |id, c|
      if !c["category"].present?
        # written in JSON without a "category" key at all
        c["category"] = ""
      end
      if c["category"].empty?
        # found in read_categories or not yet filled out in categories.json
        incomplete_categories << c["name"]
      end
    end
    if !incomplete_categories.empty?
      categories_new = "#{@categories_filename}.new"
      File.open(categories_new, "w") do |f|
        f.write(@categories.to_json)
        raise RuntimeError.new("Category file missing categories for #{incomplete_categories}, edit #{categories_new} and rename it to #{@categories_filename} before running the same import")
      end
    end
  end

  def map_categories
    puts "", "Mapping categories from Google+ to Discourse..."

    @categories.each do |id, cat|
      if cat["parent"].present? && !cat["parent"].empty?
        # two separate sub-categories can have the same name, so identify by parent
        Category.where(name: cat["category"]).each do |category|
          parent = Category.where(id: category.parent_category_id).first
          @cats[id] = category if parent.name == cat["parent"]
        end
      else
        if category = Category.where(name: cat["category"]).first
          @cats[id] = category
        elsif @create_categories
          params = {}
          params[:name] = cat['category']
          params[:id] = id
          puts "Creating #{cat['category']}"
          category = create_category(params, id)
          @cats[id] = category
        end
      end
      raise RuntimeError.new("Could not find category #{cat["category"]} for #{cat}") if @cats[id].nil?
    end
  end

  def import_users
    puts '', "Importing Google+ post and comment author users..."

    # collect authors of both posts and comments
    @feeds.each do |feed|
      feed["accounts"].each do |account|
        account["communities"].each do |community|
          community["categories"].each do |category|
            category["posts"].each do |post|
              import_author_user(post["author"])
              if post["message"].present?
                import_message_users(post["message"])
              end
              post["comments"].each do |comment|
                import_author_user(comment["author"])
                if comment["message"].present?
                  import_message_users(comment["message"])
                end
              end
            end
          end
        end
      end
    end

    return if @dryrun

    # now create them all
    create_users(@newusers) do |id, u|
      {
        id: id,
        email: u[:email],
        name: u[:name],
        post_create_action: u[:post_create_action]
      }
    end
  end

  def import_author_user(author)
    id = author["id"]
    name = author["name"]
    import_google_user(id, name)
  end
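
  # Message fragments are arrays: fragment[0] is a type code
  # (0 = text, 1 = line break, 2 = link, 3 = user reference,
  # 4 = hashtag), fragment[1] is the display text (where present),
  # and fragment[2] carries type-specific data: formatting flags for
  # text, the URL for links, the Google user ID for user references.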
  def import_message_users(message)
    message.each do |fragment|
      if fragment[0] == 3 && !fragment[2].nil?
        # deleted G+ users show up with a null ID
        import_google_user(fragment[2], fragment[1])
      end
    end
  end

  def import_google_user(id, name)
    if !@emails[id].present?
      google_user_info = UserAssociatedAccount.find_by(provider_name: 'google_oauth2', provider_uid: id.to_i)
      if google_user_info.nil?
        # create a new google user on the system; expect this user to be
        # merged when they later log in with google authentication.
        # Note that because the email address is not included in G+ data,
        # we don't know if they already have another account not yet
        # associated with google oauth2. If they didn't log in, they'll
        # have an @gplus.invalid address associated with their account
        email = "#{id}@gplus.invalid"
        @newusers[id] = {
          email: email,
          name: name,
          post_create_action: proc do |newuser|
            newuser.approved = true
            newuser.approved_by_id = @system_user.id
            newuser.approved_at = newuser.created_at
            if @blacklist.include?(id.to_s)
              now = DateTime.now
              forever = 1000.years.from_now
              # you can suspend as well if you want your blacklist to
              # be hard to recover from
              #newuser.suspended_at = now
              #newuser.suspended_till = forever
              newuser.silenced_till = forever
            end
            newuser.save
            @users[id] = newuser
            UserAssociatedAccount.create(provider_name: 'google_oauth2', user_id: newuser.id, provider_uid: id)
            # Do not send email to the invalid email addresses;
            # this can be removed after merging with #7162
            s = UserStat.where(user_id: newuser.id).first
            s.bounce_score = @invalid_bounce_score
            s.reset_bounce_score_after = 1000.years.from_now
            s.save
          end
        }
      else
        # user already on system
        u = User.find(google_user_info.user_id)
        if u.silenced? || u.suspended?
          @blacklist.add(id)
        end
        @users[id] = u
        email = u.email
      end
      @emails[id] = email
    end
  end

  def import_posts
    # "post" is confusing here:
    # - a google+ post is a discourse topic
    # - a google+ comment is a discourse post
    puts '', "Importing Google+ posts and comments..."

    @feeds.each do |feed|
      feed["accounts"].each do |account|
        account["communities"].each do |community|
          community["categories"].each do |category|
            category["posts"].each do |post|
              # G+ post / Discourse topic
              import_topic(post, category)
              print("\r#{@topics_imported}/#{@posts_imported} topics/posts (skipped: #{@topics_skipped}/#{@posts_skipped} blacklisted: #{@topics_blacklisted}/#{@posts_blacklisted}) ")
            end
          end
        end
      end
    end
    puts ''
  end

  def import_topic(post, category)
    # no parent for discourse topics / G+ posts
    if topic_id = post_id_from_imported_post_id(post["id"])
      # already imported topic; might need to attach more comments/posts
      p = Post.find_by(id: topic_id)
      @topics_skipped += 1
    else
      # new post
      if !@whitelist.nil? && !@whitelist.include?(post["author"]["id"])
        # ignore non-whitelisted posts only if a whitelist is defined
        return
      end
      postmap = make_postmap(post, category, nil)
      if postmap.nil?
        @topics_blacklisted += 1
        return
      end
      p = create_post(postmap, postmap[:id]) if !@dryrun
      @topics_imported += 1
    end

    # iterate over the comments in the post
    post["comments"].each do |comment|
      # category is nil for comments
      if post_id_from_imported_post_id(comment["id"])
        @posts_skipped += 1
      else
        commentmap = make_postmap(comment, nil, p)
        if commentmap.nil?
          @posts_blacklisted += 1
        else
          @posts_imported += 1
          new_comment = create_post(commentmap, commentmap[:id]) if !@dryrun
        end
      end
    end
  end

  def make_postmap(post, category, parent)
    post_author_id = post["author"]["id"]
    return nil if @blacklist.include?(post_author_id.to_s)

    raw = formatted_message(post)
    # if there is no message, image, or images, it's just empty
    return nil if raw.length < @min_post_raw_characters

    created_at = Time.zone.parse(post["createdAt"])
    return nil if !@last_date.nil? && created_at > @last_date
    return nil if !@first_date.nil? && created_at < @first_date

    user_id = user_id_from_imported_user_id(post_author_id)
    if user_id.nil?
      user_id = @users[post["author"]["id"]].id
    end

    mapped = {
      id: post["id"],
      user_id: user_id,
      created_at: created_at,
      raw: raw,
      cook_method: Post.cook_methods[:regular],
    }

    # category is nil for comments and set for posts, so post-only things go here
    if !category.nil?
      cat_id = category["id"]
      mapped[:title] = parse_title(post, created_at)
      mapped[:category] = @cats[cat_id].id
      mapped[:tags] = Array.new(@globaltags)
      if @categories[cat_id]["tags"].present?
        mapped[:tags].append(@categories[cat_id]["tags"]).flatten!
      end
    else
      mapped[:topic_id] = parent.topic_id if !@dryrun
    end

    # FIXME: import G+ "+1" as "like" if the F+MG+E feature request is implemented

    mapped
  end

  def parse_title(post, created_at)
    # G+ has no titles, so we have to make something up
    if post["message"].present?
      title_text(post, created_at)
    else
      # probably just posted an image and/or album
      untitled(post["author"]["name"], created_at)
    end
  end
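
  # Illustrative example (hypothetical post text): a long post starting
  # "Finished the new deck today. Here is the story of how it went..."
  # gets the title "Finished the new deck today.", since the last full
  # stop within the first @max_title_words words ends the title; a
  # message below the word or character minimums falls back to untitled().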
  def title_text(post, created_at)
    words = message_text(post["message"])
    if words.empty? || words.join("").length < @min_title_characters || words.length < @min_title_words
      # the database has a minimum title length
      # short posts appear not to work well as titles most of the time (in practice)
      return untitled(post["author"]["name"], created_at)
    end
    words = words[0..(@max_title_words - 1)]
    lastword = nil
    (@min_title_words..(words.length - 1)).each do |i|
      # prefer a full stop
      if words[i].end_with?(".")
        lastword = i
      end
    end
    if lastword.nil?
      # fall back on other punctuation
      (@min_title_words..(words.length - 1)).each do |i|
        if words[i].end_with?(',', ';', ':', '?')
          lastword = i
        end
      end
    end
    if !lastword.nil?
      # found a logical terminating word
      words = words[0..lastword]
    end
    # the database has a max title length, which is longer than a good display shows anyway
    words.join(" ").scan(/.{1,254}/)[0]
  end

  def untitled(name, created_at)
    "Google+ post by #{name} on #{created_at}"
  end

  def message_text(message)
    # only words, no markup
    words = []
    text_types = [0, 3]
    message.each do |fragment|
      if text_types.include?(fragment[0])
        fragment[1].split().each do |word|
          words << word
        end
      elsif fragment[0] == 2
        # use the display text of a link
        words << fragment[1]
      end
    end
    words
  end

  def formatted_message(post)
    lines = []
    urls_seen = Set[]
    if post["message"].present?
      post["message"].each do |fragment|
        lines << formatted_message_fragment(fragment, post, urls_seen)
      end
    end
    # yes, both "image" and "images"; "video" and "videos" :(
    if post["video"].present?
      lines << "\n#{formatted_link(post["video"]["proxy"])}\n"
    elsif post["image"].present?
      # if there are both an image and a video, the image is a cover image for the video
      lines << "\n#{formatted_link(post["image"]["proxy"])}\n"
    end
    if post["images"].present?
      post["images"].each do |image|
        lines << "\n#{formatted_link(image["proxy"])}\n"
      end
    end
    if post["videos"].present?
      post["videos"].each do |video|
        lines << "\n#{formatted_link(video["proxy"])}\n"
      end
    end
    if post["link"].present? && post["link"]["url"].present?
      url = post["link"]["url"]
      if !urls_seen.include?(url)
        # add the URL only if it wasn't already referenced, because
        # they are often redundant
        lines << "\n#{post["link"]["url"]}\n"
        urls_seen.add(url)
      end
    end
    lines.join("")
  end

  def formatted_message_fragment(fragment, post, urls_seen)
    # markdown does not nest reliably the same way as either G+'s markup or
    # what users intended in G+, so generate HTML tags instead.
    # this method uses return to make sure it doesn't fall through accidentally
    if fragment[0] == 0
      # Random zero-width join characters break the output; in particular, they are
      # common after plus-references and break @name recognition. Just get rid of them.
      # Also deal with 0x80 (really‽) and non-breaking spaces
      text = fragment[1].gsub(/(\u200d|\u0080)/, "").gsub(/\u00a0/, " ")
      if fragment[2].nil?
        return text
      else
        if fragment[2]["italic"].present?
          text = "<i>#{text}</i>"
        end
        if fragment[2]["bold"].present?
          text = "<b>#{text}</b>"
        end
        if fragment[2]["strikethrough"].present?
          # <s> more likely than <del> to represent user intent?
          text = "<s>#{text}</s>"
        end
        return text
      end
    elsif fragment[0] == 1
      "\n"
    elsif fragment[0] == 2
      urls_seen.add(fragment[2])
      formatted_link_text(fragment[2], fragment[1])
    elsif fragment[0] == 3
      # reference to a user
      if @usermap.include?(fragment[2].to_s)
        return "@#{@usermap[fragment[2].to_s]}"
      end
      if fragment[2].nil?
        # deleted G+ users show up with a null ID
        return "+#{fragment[1]}"
      end
      # G+ occasionally doesn't put proper spaces after user references
      if user = find_user_by_import_id(fragment[2])
        # user was in this import's authors
        return "@#{user.username} "
      else
        if google_user_info = UserAssociatedAccount.find_by(provider_name: 'google_oauth2', provider_uid: fragment[2])
          # user was not in this import, but has logged in or been imported otherwise
          user = User.find(google_user_info.user_id)
          return "@#{user.username} "
        else
          raise RuntimeError.new("Google user #{fragment[1]} (id #{fragment[2]}) not imported") if !@dryrun
          # if you want to fall back to their G+ name, just erase the raise above,
          # but this should not happen
          return "+#{fragment[1]}"
        end
      end
    elsif fragment[0] == 4
      # hashtag; the octothorpe is included
      fragment[1]
    else
      raise RuntimeError.new("message code #{fragment[0]} not recognized!")
    end
  end

  def formatted_link(url)
    formatted_link_text(url, url)
  end
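
  # Illustrative output (file names are hypothetical): an image upload
  # renders as "![deck.jpg](upload://abc123.jpeg)", while a video upload
  # is emitted as an absolute URL (hence @site_base_url) so Discourse
  # can render it directly.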
  def embedded_image_md(upload)
    # remove unnecessary size logic relative to embedded_image_html
    upload_name = upload.short_url || upload.url
    if upload_name =~ /\.(mov|mp4|webm|ogv)$/i
      @site_base_url + upload.url
    else
      "![#{upload.original_filename}](#{upload_name})"
    end
  end

  def formatted_link_text(url, text)
    # two ways to present images attached to posts; you may want to edit this for preference
    # - display: embedded_image_html(upload)
    # - download links: attachment_html(upload, text)
    # you might even want to make it depend on the file name.
    if @images[text].present?
      # F+MG+E provides the URL it downloaded in the text slot;
      # we won't use the plus url at all since it will disappear anyway
      url = text
    end
    if @uploaded[url].present?
      upload = @uploaded[url]
      return "\n#{embedded_image_md(upload)}"
    elsif @images[url].present?
      missing = "missing/deleted image from Google+"
      return missing if !Pathname.new(@images[url][:filepath]).exist?
      @imagefiles.write("#{@images[url][:filepath]}\n") if !@imagefiles.nil?
      upload = create_upload(@system_user.id, @images[url][:filepath], @images[url][:filename])
      if upload.nil? || upload.id.nil?
        # upload can be nil if the image conversion fails;
        # upload.id can be nil for at least videos, and possibly deleted images
        return missing
      end
      upload.save
      @totalsize += @images[url][:filesize].to_i
      @uploaded[url] = upload
      return "\n#{embedded_image_md(upload)}"
    end
    if text == url
      # leave the URL bare and Discourse will do the right thing
      return url
    else
      # It turns out that in the only place we get here, google has done its
      # own text interpolation that doesn't look good on Discourse, so while
      # it looks like this should be:
      #   return "[#{text}](#{url})"
      # it actually looks better to throw away the google-provided text:
      return url
    end
  end
end

if __FILE__ == $0
  ImportScripts::FMGP.new.perform
end