discourse/script/import_scripts/friendsmegplus.rb

# frozen_string_literal: true

require File.expand_path(File.dirname(__FILE__) + "/base.rb")

require "csv"

# Importer for Friends+Me Google+ Exporter (F+MG+E) output.
#
# Takes the full path (absolute or relative) to
# * each of the F+MG+E JSON export files you want to import
# * the F+MG+E google-plus-image-list.csv file,
# * a categories.json file you write to describe how the Google+
#   categories map to Discourse categories, subcategories, and tags.
#
# You can provide all the F+MG+E JSON export files in a single import
# run.  This will be the fastest way to do the entire import if you
# have enough memory and disk space.  It will work just as well to
# import each F+MG+E JSON export file separately.  This might be
# valuable if you have memory or space limitations, as the memory to
# hold all the data from the F+MG+E JSON export files is one of the
# key resources used by this script.
#
# Create an initial empty ("{}") categories.json file, and the import
# script will write a .new file for you to fill in the details.
# You will probably want to use jq to reformat the .new file before
# trying to edit it.  `jq . categories.json.new > categories.json`
#
# Provide a filename that ends with "upload-paths.txt" and the path
# of each uploaded file will be written to the file with that
# name.
#
# Edit values at the top of the script to fit your preferences

class ImportScripts::FMGP < ImportScripts::Base
  # Sets the importer's tunable defaults, adjusts the site settings the
  # import relies on, and then interprets every command-line argument by
  # its file name (or flag prefix).
  #
  # Raises RuntimeError for an unrecognized argument or when no
  # categories.json file was given.
  def initialize
    super

    # Base URL for the site; required for importing videos.
    # Typically just 'https:' in production.
    @site_base_url = "http://localhost:3000"
    @system_user = Discourse.system_user
    SiteSetting.max_image_size_kb = 40_960
    SiteSetting.max_attachment_size_kb = 40_960
    # accept the same video extensions as the rest of Discourse
    video_extensions = %w[mp4 mov webm ogv]
    allowed_extensions = SiteSetting.authorized_extensions.split("|") + video_extensions
    SiteSetting.authorized_extensions = allowed_extensions.uniq.join("|")
    @invalid_bounce_score = 5.0
    @min_title_words = 3
    @max_title_words = 14
    @min_title_characters = 12
    @min_post_raw_characters = 12
    # When true, categories listed in categories.json are created.
    # Parent relationships are not honored; the categories are expected
    # to be rearranged after the import.
    @create_categories = false

    # parsed F+MG+E JSON exports, one entry per file
    @feeds = []

    # image/video URL => { filename:, filepath:, filesize: } from the F+MG+E CSV
    @images = {}

    # Google ID => local username, for users that should map to an
    # existing handle, e.g.
    # {
    #   "128465039243871098234": "handle"
    # }
    # makes GoogleID 128465039243871098234 show up as @handle
    @usermap = {}

    # G+ user IDs to filter out (spam, abuse): none of their topics or
    # posts are imported and they are silenced when created.
    # Loaded from blocklist.json, an array of google ids
    # `[ 92310293874, 12378491235293 ]`
    @blocklist = Set.new

    # G+ user IDs whose posts are useful; when set, only posts (and
    # non-blocklisted comments) authored by these IDs are included
    @allowlist = nil

    # Tags applied to every topic; use an empty Array for none
    @globaltags = ["gplus"]

    # optional output file listing the path of every uploaded file
    @imagefiles = nil

    # categories.json maps:
    # "google-category-uuid": {
    #   "name": 'google+ category name',
    #   "category": 'category name',
    #   "parent": 'parent name', # optional
    #   "create": true, # optional
    #   "tags": ['list', 'of', 'tags'] optional
    # }
    # Start with '{}', let the script generate categories.json.new once, then edit and re-run
    @categories = {}

    # remembered so a .new file can be written next to it
    @categories_filename = nil
    # a dry run parses but doesn't create
    @dryrun = false
    # cut-off date, for late-spammed abandoned communities
    @last_date = nil
    # start date, for early-spammed rescued communities
    @first_date = nil

    # every argument is a file name or a flag; the name decides its role
    ARGV.each do |arg|
      case
      when arg.end_with?(".csv")
        # F+MG+E CSV columns: "URL";"IsDownloaded";"FileName";"FilePath";"FileSize"
        CSV.foreach(arg, headers: true, col_sep: ";") do |row|
          @images[row[0]] = { filename: row[2], filepath: row[3], filesize: row[4] }
        end
      when arg.end_with?("upload-paths.txt")
        @imagefiles = File.open(arg, "w")
      when arg.end_with?("categories.json")
        @categories_filename = arg
        @categories = load_fmgp_json(arg)
      when arg.end_with?("usermap.json")
        @usermap = load_fmgp_json(arg)
      when arg.end_with?("blocklist.json")
        @blocklist = load_fmgp_json(arg).map(&:to_s).to_set
      when arg.end_with?("allowlist.json")
        @allowlist = load_fmgp_json(arg).map(&:to_s).to_set
      when arg.end_with?(".json")
        # any other JSON file is taken to be a community export
        @feeds << load_fmgp_json(arg)
      when arg == "--dry-run"
        @dryrun = true
      when arg.start_with?("--last-date=")
        @last_date = Time.zone.parse(arg.gsub(/.*=/, ""))
      when arg.start_with?("--first-date=")
        @first_date = Time.zone.parse(arg.gsub(/.*=/, ""))
      else
        raise RuntimeError, "unknown argument #{arg}"
      end
    end

    raise RuntimeError, "Must provide a categories.json file" if @categories_filename.nil?

    # category objects looked up in the database, keyed by Google category id
    @cats = {}
    # results of google auth DB lookups
    @emails = {}
    @newusers = {}
    @users = {}
    # uploads already created, keyed by URL
    @uploaded = {}
    # progress counters
    @topics_imported = 0
    @posts_imported = 0
    @topics_skipped = 0
    @posts_skipped = 0
    @blocked_topics = 0
    @blocked_posts = 0
    # running total of uploaded file sizes
    @totalsize = 0
  end

  # Runs the whole import: category preparation first, then users, then
  # topics and posts. Afterwards closes the upload-path list (when one
  # was opened) and reports the total uploaded size.
  def execute
    puts "", "Importing from Friends+Me Google+ Exporter..."

    read_categories
    check_categories
    map_categories

    import_users
    import_posts

    # Trust level 0 is deliberately not set for imported users. If F+MG+E
    # ever exports +1 data, users who only gave a +1 (never posted or
    # commented) should become TL0, and this would be the place, after all
    # other processing:
    # update_tl0

    @imagefiles&.close
    puts "", "Uploaded #{@totalsize} bytes of image files"
    puts "", "Done"
  end

  # Reads +filename+ and returns its parsed JSON content.
  # Raises RuntimeError when the file does not exist.
  def load_fmgp_json(filename)
    raise RuntimeError, "File #{filename} not found" unless File.exist?(filename)

    contents = File.read(filename)
    JSON.parse(contents)
  end

  # Walks every category of every community in the loaded feeds and makes
  # sure @categories has an entry for it: unknown categories get a blank
  # template to be filled in by hand, known ones get their community name
  # filled in when it is missing.
  def read_categories
    @feeds.each do |feed|
      feed["accounts"].each do |account|
        account["communities"].each do |community|
          community["categories"].each do |category|
            google_id = category["id"]
            known = @categories[google_id]

            if known.present?
              known["community"] = community["name"] unless known["community"].present?
            else
              # blank entry, written out later so it can be completed manually
              @categories[google_id] = {
                "name" => category["name"],
                "community" => community["name"],
                "category" => "",
                "parent" => nil,
                "tags" => [],
              }
            end
          end
        end
      end
    end
  end

  # Verifies that every entry in @categories names a Discourse category.
  #
  # Entries without a usable "category" value are normalized to "". If any
  # such entries exist, the full category map is written as JSON to
  # "<categories file>.new" and a RuntimeError is raised telling the
  # operator which Google+ categories still need to be filled in and which
  # file to edit and rename.
  def check_categories
    incomplete_categories = []
    @categories.each do |_id, c|
      # covers entries written in JSON without a "category" key at all
      c["category"] = "" unless c["category"].present?
      # discovered by read_categories or not yet filled out in categories.json
      incomplete_categories << c["name"] if c["category"].empty?
    end
    return if incomplete_categories.empty?

    categories_new = "#{@categories_filename}.new"
    # finish writing (and close) the template before reporting the problem
    File.open(categories_new, "w") { |f| f.write(@categories.to_json) }
    raise RuntimeError,
          "Category file missing categories for #{incomplete_categories}, edit #{categories_new} and rename it to #{@categories_filename} before running the same import"
  end

  # Resolves every entry of @categories to a Category record and stores it
  # in @cats under the Google+ category id.
  #
  # Entries with a "parent" are matched by both their own name and the
  # parent's name, because two sub-categories may share a name. Entries
  # without a parent are matched by name alone and, when @create_categories
  # is set, created if missing.
  #
  # Raises RuntimeError when an entry cannot be resolved.
  def map_categories
    puts "", "Mapping categories from Google+ to Discourse..."

    @categories.each do |id, cat|
      if cat["parent"].present?
        Category
          .where(name: cat["category"])
          .each do |category|
            parent = Category.where(id: category.parent_category_id).first
            # a same-named category without a parent yields no parent
            # record; it simply is not the one we are looking for
            @cats[id] = category if parent&.name == cat["parent"]
          end
      elsif (category = Category.where(name: cat["category"]).first)
        @cats[id] = category
      elsif @create_categories
        puts "Creating #{cat["category"]}"
        @cats[id] = create_category({ name: cat["category"], id: id }, id)
      end

      if @cats[id].nil?
        raise RuntimeError, "Could not find category #{cat["category"]} for #{cat}"
      end
    end
  end

  # Collects every Google+ user that authored, or is mentioned in, a post
  # or comment of the loaded feeds, then (unless this is a dry run) creates
  # the users gathered in @newusers.
  def import_users
    puts "", "Importing Google+ post and comment author users..."

    # registers the author of an entry and the users referenced in its message
    register_users =
      lambda do |entry|
        import_author_user(entry["author"])
        import_message_users(entry["message"]) if entry["message"].present?
      end

    @feeds.each do |feed|
      feed["accounts"].each do |account|
        account["communities"].each do |community|
          community["categories"].each do |category|
            category["posts"].each do |post|
              register_users.call(post)
              post["comments"].each { |comment| register_users.call(comment) }
            end
          end
        end
      end
    end

    return if @dryrun

    # everything is collected; create the accounts in one go
    create_users(@newusers) do |google_id, details|
      {
        id: google_id,
        email: details[:email],
        name: details[:name],
        post_create_action: details[:post_create_action],
      }
    end
  end

  # Registers the Google+ user described by an "author" hash, which
  # carries the user's "id" and "name".
  def import_author_user(author)
    import_google_user(author["id"], author["name"])
  end

  # Registers every user referenced inside a message. A user reference is
  # a fragment of type 3 holding the display name at index 1 and the
  # Google ID at index 2.
  def import_message_users(message)
    message.each do |fragment|
      next unless fragment[0] == 3
      # deleted G+ users show up with a null ID; nothing to register
      next if fragment[2].nil?

      import_google_user(fragment[2], fragment[1])
    end
  end

  # Looks up (once per Google ID) whether a Google+ user already has a
  # local account linked through google_oauth2.
  #
  # * Not linked: queues the user in @newusers with a placeholder
  #   "<id>@gplus.invalid" address and a post_create_action that approves
  #   the account, silences blocklisted users, links the Google account and
  #   raises the bounce score so no mail is sent to the invalid address.
  # * Linked: remembers the existing user in @users and blocklists the ID
  #   when that user is silenced or suspended.
  #
  # The resolved e-mail address is cached in @emails so later calls for the
  # same ID do nothing.
  def import_google_user(id, name)
    return if @emails[id].present?

    google_user_info =
      UserAssociatedAccount.find_by(provider_name: "google_oauth2", provider_uid: id.to_i)
    if google_user_info.nil?
      # Create a new google user on the system; this user is expected to
      # merge when they later log in with google authentication.
      # Because the G+ data carries no email address, we cannot know
      # whether they already have another account not yet associated with
      # google oauth2. Until they log in they keep an @gplus.invalid
      # address.
      email = "#{id}@gplus.invalid"
      @newusers[id] = {
        email: email,
        name: name,
        post_create_action:
          proc do |newuser|
            newuser.approved = true
            newuser.approved_by_id = @system_user.id
            newuser.approved_at = newuser.created_at
            if @blocklist.include?(id.to_s)
              forever = 1000.years.from_now
              # you can suspend as well if you want your blocklist to
              # be hard to recover from
              #newuser.suspended_at = Time.now
              #newuser.suspended_till = forever
              newuser.silenced_till = forever
            end
            newuser.save
            @users[id] = newuser
            UserAssociatedAccount.create(
              provider_name: "google_oauth2",
              user_id: newuser.id,
              provider_uid: id,
            )
            # Do not send email to the invalid email addresses
            # this can be removed after merging with #7162
            s = UserStat.where(user_id: newuser.id).first
            s.bounce_score = @invalid_bounce_score
            s.reset_bounce_score_after = 1000.years.from_now
            s.save
          end,
      }
    else
      # user already on system
      u = User.find(google_user_info.user_id)
      # the blocklist holds string IDs and is always queried with id.to_s
      @blocklist.add(id.to_s) if u.silenced? || u.suspended?
      @users[id] = u
      email = u.email
    end
    @emails[id] = email
  end

  # Imports every Google+ post found in the loaded feeds, printing a
  # running progress line after each one.
  #
  # Terminology: a Google+ post becomes a Discourse topic, and a Google+
  # comment becomes a Discourse post.
  def import_posts
    puts "", "Importing Google+ posts and comments..."

    @feeds.each do |feed|
      feed["accounts"].each do |account|
        account["communities"].each do |community|
          community["categories"].each do |category|
            category["posts"].each do |gplus_post|
              # G+ post / Discourse topic, together with its comments
              import_topic(gplus_post, category)
              progress =
                "\r#{@topics_imported}/#{@posts_imported} topics/posts (skipped: #{@topics_skipped}/#{@posts_skipped} blocklisted: #{@blocked_topics}/#{@blocked_posts})       "
              print(progress)
            end
          end
        end
      end
    end

    puts ""
  end

  # Imports one Google+ post as a Discourse topic, followed by its comments
  # as posts in that topic, updating the progress counters along the way.
  #
  # * A post that was imported earlier is not re-created, but comments that
  #   are still missing are attached to it.
  # * When an allowlist is defined, new posts by other authors are ignored
  #   entirely (comments included).
  # * Posts or comments for which make_postmap returns nil are counted as
  #   blocked; a blocked post also drops its comments.
  # * In a dry run nothing is created, only counted.
  def import_topic(post, category)
    # no parent for discourse topics / G+ posts
    if topic_id = post_id_from_imported_post_id(post["id"])
      # already imported topic; might need to attach more comments/posts
      p = Post.find_by(id: topic_id)
      @topics_skipped += 1
    else
      # New post. The allowlist stores IDs as strings, so compare as a
      # string, the same way the blocklist is queried.
      if !@allowlist.nil? && !@allowlist.include?(post["author"]["id"].to_s)
        # only ignore non-allowlisted if allowlist defined
        return
      end
      postmap = make_postmap(post, category, nil)
      if postmap.nil?
        @blocked_topics += 1
        return
      end
      p = create_post(postmap, postmap[:id]) if !@dryrun
      @topics_imported += 1
    end
    # iterate over comments in post
    post["comments"].each do |comment|
      if post_id_from_imported_post_id(comment["id"])
        @posts_skipped += 1
        next
      end

      # category is nil for comments
      commentmap = make_postmap(comment, nil, p)
      if commentmap.nil?
        @blocked_posts += 1
      else
        @posts_imported += 1
        create_post(commentmap, commentmap[:id]) if !@dryrun
      end
    end
  end

  # Builds the attribute hash handed to create_post for a Google+ post
  # (category given, parent nil) or comment (category nil, parent is the
  # topic's first post).
  #
  # Returns nil when the entry must not be imported: the author is
  # blocklisted, the formatted text is shorter than
  # @min_post_raw_characters, or its creation time falls outside the
  # optional @first_date / @last_date window.
  #
  # Raises RuntimeError outside of a dry run when the author cannot be
  # resolved to a local user. In a dry run no users are created, so the
  # user_id is simply left nil.
  def make_postmap(post, category, parent)
    post_author_id = post["author"]["id"]
    return nil if @blocklist.include?(post_author_id.to_s)

    raw = formatted_message(post)
    # if no message, image, or images, it's just empty
    return nil if raw.length < @min_post_raw_characters

    created_at = Time.zone.parse(post["createdAt"])
    return nil if !@last_date.nil? && created_at > @last_date
    return nil if !@first_date.nil? && created_at < @first_date

    user_id = user_id_from_imported_user_id(post_author_id)
    if user_id.nil?
      # not created by an import; fall back to the account found by Google ID
      known_user = @users[post_author_id]
      if known_user.nil? && !@dryrun
        raise RuntimeError,
              "Google user id #{post_author_id} not imported, cannot attribute #{post["id"]}"
      end
      user_id = known_user&.id
    end

    mapped = {
      id: post["id"],
      user_id: user_id,
      created_at: created_at,
      raw: raw,
      cook_method: Post.cook_methods[:regular],
    }

    # nil category for comments, set for posts, so post-only things here
    if !category.nil?
      cat_id = category["id"]
      mapped[:title] = parse_title(post, created_at)
      mapped[:category] = @cats[cat_id].id
      # always a fresh array so @globaltags itself is never modified
      mapped[:tags] = Array.new(@globaltags)
      category_tags = @categories[cat_id]["tags"]
      mapped[:tags] = (mapped[:tags] + [category_tags]).flatten if category_tags.present?
    else
      # in a dry run the parent post was never created
      mapped[:topic_id] = parent.topic_id if !@dryrun
    end
    # FIXME: import G+ "+1" as "like" if F+MG+E feature request implemented

    mapped
  end

  # Google+ posts have no title, so one is derived: from the message text
  # when there is a message, otherwise (most likely a bare image or album)
  # a generic "untitled" title naming the author and date.
  def parse_title(post, created_at)
    return title_text(post, created_at) if post["message"].present?

    untitled(post["author"]["name"], created_at)
  end

  # Derives a title from the words of the post's message.
  #
  # Falls back to the generic "untitled" title when the text is too short
  # (fewer than @min_title_words words or @min_title_characters
  # characters). Otherwise takes at most @max_title_words words and, when a
  # word at or after position @min_title_words ends a sentence or clause,
  # cuts after the last such word: full stops are preferred, then other
  # punctuation. The result is capped at 254 characters.
  def title_text(post, created_at)
    words = message_text(post["message"])
    too_short =
      words.empty? || words.join("").length < @min_title_characters ||
        words.length < @min_title_words
    # the database has a minimum length, and in practice short posts
    # rarely make good titles
    return untitled(post["author"]["name"], created_at) if too_short

    words = words.slice(0..(@max_title_words - 1))
    candidates = (@min_title_words..(words.length - 1)).to_a

    # prefer the last full stop; otherwise the last other punctuation mark
    cut = candidates.reverse.find { |i| words[i].end_with?(".") }
    cut = candidates.reverse.find { |i| words[i].end_with?(",", ";", ":", "?") } if cut.nil?

    # a logical terminating word was found
    words = words.slice(0..cut) unless cut.nil?

    # the database has a max title length, which is longer than a good
    # display shows anyway
    words.join(" ").scan(/.{1,254}/).first
  end

  # Generic title for posts whose text cannot provide one; names the
  # author and the creation time.
  def untitled(name, created_at)
    ["Google+ post by ", name, " on ", created_at].join
  end

  # Returns the plain words of a message, without any markup. Text (type 0)
  # and user-reference (type 3) fragments are split on whitespace; for
  # links (type 2) the display text is kept as a single entry. Other
  # fragment types contribute nothing.
  def message_text(message)
    message.each_with_object([]) do |fragment, words|
      case fragment[0]
      when 0, 3
        words.concat(fragment[1].split)
      when 2
        # use the display text of a link
        words << fragment[1]
      end
    end
  end

  # Renders a Google+ post or comment as the raw text of a Discourse post:
  # the formatted message fragments, then any attached video or image,
  # image list, video list, and finally the attached link unless the same
  # URL was already referenced in the message.
  def formatted_message(post)
    urls_seen = Set[]
    # media attachments are rendered from their "proxy" URL on a line of their own
    media_line = ->(media) { "\n#{formatted_link(media["proxy"])}\n" }

    parts = []
    if post["message"].present?
      post["message"].each do |fragment|
        parts << formatted_message_fragment(fragment, post, urls_seen)
      end
    end

    # yes, both "image" and "images"; "video" and "videos" :(
    if post["video"].present?
      # with both image and video present, the image is only the video's cover
      parts << media_line.call(post["video"])
    elsif post["image"].present?
      parts << media_line.call(post["image"])
    end
    post["images"].each { |image| parts << media_line.call(image) } if post["images"].present?
    post["videos"].each { |video| parts << media_line.call(video) } if post["videos"].present?

    if post["link"].present? && post["link"]["url"].present?
      link_url = post["link"]["url"]
      # the attached link is often redundant; add? is nil when the URL
      # was already seen
      parts << "\n#{link_url}\n" if urls_seen.add?(link_url)
    end

    parts.join("")
  end

  # Renders one message fragment. A fragment is an array whose first
  # element is a type code:
  #   0 text (index 1: text, index 2: optional formatting flags)
  #   1 line break
  #   2 link (index 1: display text, index 2: URL, recorded in urls_seen)
  #   3 user reference (index 1: display name, index 2: Google ID)
  #   4 hashtag (index 1 includes the octothorpe)
  #
  # HTML tags are generated rather than markdown because markdown does not
  # nest reliably the same way as G+'s markup or what users intended.
  #
  # Raises RuntimeError for an unknown type code, and (outside a dry run)
  # for a referenced user that is neither imported nor known locally.
  def formatted_message_fragment(fragment, post, urls_seen)
    kind = fragment[0]
    content = fragment[1]
    detail = fragment[2]

    case kind
    when 0
      # Random zero-width join characters break the output; in particular, they are
      # common after plus-references and break @name recognition. Just get rid of them.
      # Also deal with 0x80 (really‽) and non-breaking spaces
      text = content.gsub(/(\u200d|\u0080)/, "").gsub(/\u00a0/, " ")
      return text if detail.nil?

      text = "<i>#{text}</i>" if detail["italic"].present?
      text = "<b>#{text}</b>" if detail["bold"].present?
      # s more likely than del to represent user intent?
      text = "<s>#{text}</s>" if detail["strikethrough"].present?
      text
    when 1
      "\n"
    when 2
      urls_seen.add(detail)
      formatted_link_text(detail, content)
    when 3
      # reference to a user; an explicit usermap entry wins
      google_id = detail.to_s
      return "@#{@usermap[google_id]}" if @usermap.include?(google_id)
      # deleted G+ users show up with a null ID
      return "<b>+#{content}</b>" if detail.nil?

      # G+ occasionally doesn't put proper spaces after users, hence the
      # trailing space after usernames
      imported = find_user_by_import_id(detail)
      # user was in this import's authors
      return "@#{imported.username} " if imported

      linked =
        UserAssociatedAccount.find_by(provider_name: "google_oauth2", provider_uid: detail)
      if linked
        # user was not in this import, but has logged in or been imported otherwise
        return "@#{User.find(linked.user_id).username} "
      end

      # if you want to fall back to their G+ name, just erase the raise below,
      # but this should not happen
      raise RuntimeError, "Google user #{content} (id #{detail}) not imported" if !@dryrun
      "<b>+#{content}</b>"
    when 4
      # hashtag, the octothorpe is included
      content
    else
      raise RuntimeError, "message code #{kind} not recognized!"
    end
  end

  # Formats a bare URL, which serves as its own link text.
  def formatted_link(url)
    link_text = url
    formatted_link_text(url, link_text)
  end

  # Returns the text that embeds an upload in a post: for video files
  # (mov, mp4, webm, ogv) the site base URL followed by the upload URL on
  # its own, for everything else a markdown image using the upload's short
  # URL when it has one.
  def embedded_image_md(upload)
    reference = upload.short_url || upload.url
    video = /\.(mov|mp4|webm|ogv)$/i.match?(reference)
    return @site_base_url + upload.url if video

    "![#{upload.original_filename}](#{reference})"
  end

  # Renders a link. URLs that F+MG+E downloaded (listed in @images) are
  # uploaded once, cached in @uploaded and embedded; anything else is
  # returned as the bare URL.
  #
  # There are two ways to present images attached to posts, and you may
  # want to edit this for preference:
  # - display: embedded_image_html(upload)
  # - download links: attachment_html(upload, text)
  # You might even want to make it depend on the file name.
  def formatted_link_text(url, text)
    # F+MG+E provides the URL it downloaded in the text slot; the plus
    # URL is not used at all since it will disappear anyway
    url = text if @images[text].present?

    already_uploaded = @uploaded[url]
    return "\n#{embedded_image_md(already_uploaded)}" if already_uploaded.present?

    image = @images[url]
    if image.present?
      missing = "<i>missing/deleted image from Google+</i>"
      return missing if !Pathname.new(image[:filepath]).exist?

      @imagefiles.write("#{image[:filepath]}\n") if !@imagefiles.nil?
      upload = create_upload(@system_user.id, image[:filepath], image[:filename])
      # upload can be nil if the image conversion fails;
      # upload.id can be nil for at least videos, and possibly deleted images
      return missing if upload.nil? || upload.id.nil?

      upload.save
      @totalsize += image[:filesize].to_i
      @uploaded[url] = upload
      return "\n#{embedded_image_md(upload)}"
    end

    # A bare URL is left alone and Discourse will do the right thing.
    # When text differs from the URL this might look like it should be
    #   "[#{text}](#{url})"
    # but the only place that happens, google has done its own text
    # interpolation that doesn't look good on Discourse, so the
    # google-provided text is thrown away as well.
    url
  end
end

# Start the import only when this file is executed directly, not when it
# is loaded by other code.
ImportScripts::FMGP.new.perform if __FILE__ == $0