discourse/script/import_scripts/google_groups.rb

#!/usr/bin/env ruby
# frozen_string_literal: true

require "bundler/inline"

gemfile(true) do
  source "https://rubygems.org"

  gem "webdrivers"
  gem "colored2"
end

require "fileutils"
require "optparse"
require "set"
require "yaml"

# Base directory for scraped messages; used when no --path option is given,
# in which case the group name is appended as a subdirectory.
DEFAULT_OUTPUT_PATH = "/shared/import/data"
# Location of the cookies.txt file that is used when it exists and no
# --cookies option is given.
DEFAULT_COOKIES_TXT = "/shared/import/cookies.txt"

# Returns the Chrome WebDriver used by the whole script, creating it on first
# use and reusing the same instance afterwards.
#
# Chrome always gets "disable-gpu". It runs headless unless the environment
# variable NOT_HEADLESS is set to "1", and gets "no-sandbox" when
# inside_container? reports true.
def driver
  return @driver if @driver

  flags = ["disable-gpu"]
  flags.push("headless") if ENV["NOT_HEADLESS"] != "1"
  flags.push("no-sandbox") if inside_container?

  chrome_options = Selenium::WebDriver::Chrome::Options.new(args: flags)
  @driver = Selenium::WebDriver.for(:chrome, options: chrome_options)
end

# Tells whether the script appears to run inside a Docker container.
#
# The check looks for the word "docker" in any line of the cgroup file of
# PID 1. A missing or unreadable file (for example on hosts without /proc)
# counts as "not in a container" instead of crashing the script.
#
# cgroup_path - path of the cgroup file to inspect (defaults to PID 1's).
#
# Returns true or false.
def inside_container?(cgroup_path = "/proc/1/cgroup")
  File.foreach(cgroup_path).any? { |line| line.include?("docker") }
rescue SystemCallError
  # Errno::ENOENT, Errno::EACCES, ...: there is nothing we can inspect.
  false
end

# Upper bound for the number of attempts made when loading a page (see `get`).
MAX_GET_RETRIES = 5
# Upper bound for the number of attempts made when looking up elements
# (see `extract` and `find`).
MAX_FIND_RETRIES = 3

# Navigates the browser to the given URL, retrying when the request times out.
#
# Before every retry the method sleeps for as many seconds as attempts have
# already failed (0, 1, 2, ...). At most MAX_GET_RETRIES attempts are made.
#
# url - the address to load.
#
# Returns whatever the driver's `get` returns.
# Raises Net::ReadTimeout when all attempts timed out. The error used to be
# swallowed, which let callers scrape whatever page was still loaded.
def get(url)
  retries ||= 0
  driver.get(url)
rescue Net::ReadTimeout
  sleep retries
  retry if (retries += 1) < MAX_GET_RETRIES
  raise
end

# Finds all elements matching a CSS selector and maps them through the block.
#
# The lookup, including the block calls, is retried on read timeouts and on
# stale element references, sleeping 0, 1, 2, ... seconds between attempts.
# At most MAX_FIND_RETRIES attempts are made.
#
# css            - CSS selector of the wanted elements.
# parent_element - object to search in (defaults to the driver).
#
# Yields every matching element.
#
# Returns an Array with the block's results.
# Raises the last error when all attempts failed. The method used to return
# nil in that case, which surfaced as a NoMethodError in the callers.
def extract(css, parent_element = driver)
  retries ||= 0
  parent_element.find_elements(css: css).map { |element| yield(element) }
rescue Net::ReadTimeout, Selenium::WebDriver::Error::StaleElementReferenceError
  sleep retries
  retry if (retries += 1) < MAX_FIND_RETRIES
  raise
end

# Finds the first element matching a CSS selector.
#
# The lookup is retried on read timeouts and when the element is not
# interactable, sleeping 0, 1, 2, ... seconds between attempts. At most
# MAX_FIND_RETRIES attempts are made. Other errors are not retried.
#
# css            - CSS selector of the wanted element.
# parent_element - object to search in (defaults to the driver).
#
# Returns the element found by `find_element`.
# Raises the last error when all attempts failed. The method used to return
# nil in that case, which surfaced as a NoMethodError in the callers.
def find(css, parent_element = driver)
  retries ||= 0
  parent_element.find_element(css: css)
rescue Net::ReadTimeout, Selenium::WebDriver::Error::ElementNotInteractableError
  sleep retries
  retry if (retries += 1) < MAX_FIND_RETRIES
  raise
end

# Walks through the topic listing of the group in pages of 100 entries
# (1-100, 101-200, ...) and crawls every topic that is linked on a page.
# Stops at the first page that does not contain any topic link.
def crawl_categories
  first_index = 1

  loop do
    last_index = first_index + 99
    get("https://groups.google.com/forum/?_escaped_fragment_=categories/#{@groupname}[#{first_index}-#{last_index}]")

    # Turn the links shown in the listing into the crawlable variant of the topic URL.
    topic_urls = extract(".subject a[href*='#{@groupname}']") do |link|
      link["href"].sub("/d/topic/", "/forum/?_escaped_fragment_=topic/")
    end
    break if topic_urls.empty?

    topic_urls.each do |topic_url|
      crawl_topic(topic_url)
    end

    first_index += 100
  end
end

# Scrapes all messages of one topic.
#
# Topics whose URL is already in @scraped_topic_urls are skipped. Otherwise
# the topic page is loaded, every message link is converted into the URL of
# the raw message and handed to crawl_message, and finally the topic URL is
# added to @scraped_topic_urls.
#
# A link with an empty title is passed on with the "might be deleted" flag
# set.
#
# Any StandardError is reported on stdout and only re-raised when
# @abort_on_error is set.
def crawl_topic(url)
  already_scraped = @scraped_topic_urls.include?(url)
  return puts("#{"Skipping".green} #{url}") if already_scraped

  puts "Scraping #{url}"
  get(url)

  messages = extract(".subject a[href*='#{@groupname}']") do |link|
    raw_message_url = link["href"].sub("/d/msg/", "/forum/message/raw?msg=")
    [raw_message_url, link["title"].empty?]
  end

  messages.each do |raw_message_url, might_be_deleted|
    crawl_message(raw_message_url, might_be_deleted)
  end

  @scraped_topic_urls << url
rescue
  puts "Failed to scrape topic at #{url}".red
  raise if @abort_on_error
end

# Downloads one raw message and stores it as an .eml file inside @path.
#
# The file name is built from the part of the URL that follows
# "<groupname>/", with every "/" replaced by "-". The group name is escaped
# before it is used inside the regular expression, so characters like "."
# only match literally. Replacing all slashes keeps the file directly inside
# @path.
#
# The first message that is scraped is checked for masked email addresses
# ("...@" in a From: line). Unless @force_import is set the script exits
# with an error message in that case.
#
# url               - URL of the raw message.
# might_be_deleted  - when true, a missing <pre> element is reported as a
#                     probably deleted message instead of as a failure.
#
# Failures are printed to stdout and only re-raised when @abort_on_error is
# set.
def crawl_message(url, might_be_deleted)
  get(url)

  message_id = url[/#{Regexp.escape(@groupname)}\/(.+)/, 1].tr("/", "-")
  filename = File.join(@path, "#{message_id}.eml")
  content = find("pre")["innerText"]

  unless @first_message_checked
    @first_message_checked = true

    if content.match?(/From:.*\.\.\.@.*/i) && !@force_import
      exit_with_error(<<~MSG.red.bold)
        It looks like you do not have permissions to see email addresses. Aborting.
        Use the --force option to import anyway.
      MSG
    end
  end

  File.write(filename, content)
rescue Selenium::WebDriver::Error::NoSuchElementError
  if might_be_deleted
    puts "Message might be deleted. Skipping #{url}"
  else
    puts "Failed to scrape message at #{url}".red
    raise if @abort_on_error
  end
rescue
  puts "Failed to scrape message at #{url}".red
  raise if @abort_on_error
end

# Signs in to Google by reusing the cookies from the cookies.txt file.
#
# A page on google.com is opened first, then the cookies for the Google
# domains are added and the login page is requested. The login counts as
# successful when the browser ends up on https://myaccount.google.com;
# when waiting for that times out, the script exits with an error message.
def login
  puts "Logging in..."
  get("https://google.com/404")

  cookie_domains = ["accounts.google.com", "myaccount.google.com", "google.com"]
  add_cookies(*cookie_domains)

  get("https://accounts.google.com/servicelogin")

  begin
    wait_for_url do |current_url|
      current_url.start_with?("https://myaccount.google.com")
    end
  rescue Selenium::WebDriver::Error::TimeoutError
    failure = "Failed to login. Please check the content of your cookies.txt"
    exit_with_error(failure.red.bold)
  end
end

# Adds the cookies of the given domains from the cookies.txt file (@cookies)
# to the browser session.
#
# Only lines with exactly seven tab separated fields are considered:
# domain, flag, path, secure, expiry (epoch seconds, "0" for none), name and
# value. A cookie is used when its domain equals one of the given domains,
# optionally preceded by a dot.
#
# Lines whose domain carries the "#HttpOnly_" prefix are used as well. That
# prefix is how curl-style cookie files mark HttpOnly cookies; such lines
# used to be skipped because the prefixed domain never matched.
#
# The expiry is passed on as a Time, so the script does not depend on the
# legacy DateTime class, which it never requires.
#
# domains - domain names (without leading dot) whose cookies should be added.
def add_cookies(*domains)
  http_only_prefix = "#HttpOnly_"
  domain_patterns = domains.map { |domain| /\A\.?#{Regexp.escape(domain)}\z/ }

  File.foreach(@cookies) do |line|
    parts = line.chomp.split("\t")
    next if parts.size != 7

    cookie_domain = parts[0]
    http_only = cookie_domain.start_with?(http_only_prefix)
    cookie_domain = cookie_domain[http_only_prefix.length..-1] if http_only
    next unless domain_patterns.any? { |pattern| pattern.match?(cookie_domain) }

    driver.manage.add_cookie(
      domain: cookie_domain,
      # NOTE(review): the second column is still honoured as an httpOnly hint
      # to stay backward compatible. In the Netscape format it appears to be
      # the "include subdomains" flag -- confirm before dropping it.
      httpOnly: http_only || "true".casecmp?(parts[1]),
      path: parts[2],
      secure: "true".casecmp?(parts[3]),
      expires: parts[4] == "0" ? nil : Time.at(Integer(parts[4], 10)),
      name: parts[5],
      value: parts[6]
    )
  end
end

# Waits, with a timeout of 5 seconds, until the block accepts the URL the
# browser is currently showing.
#
# Yields the driver's current URL each time the wait evaluates its condition.
#
# Returns whatever the wait's `until` returns.
def wait_for_url
  Selenium::WebDriver::Wait.new(timeout: 5).until do
    current_url = driver.current_url
    yield(current_url)
  end
end

# Prints the given messages to STDERR, one per line, and terminates the
# script with exit status 1.
#
# messages - objects to print; each one is written with `puts`.
def exit_with_error(*messages)
  STDERR.puts(*messages)
  exit(1)
end

# Runs the whole crawl: restores the set of already scraped topic URLs from
# status.yml inside @path, logs in, crawls all topics and prints the elapsed
# time.
#
# The status file is rewritten even when crawling raises, so an interrupted
# run can be resumed.
def crawl
  # The monotonic clock is not affected by changes of the system time.
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  status_filename = File.join(@path, "status.yml")

  @scraped_topic_urls =
    if File.exist?(status_filename) # File.exists? is gone since Ruby 3.2
      # The status file contains a serialized Set. Psych 4 made
      # YAML.load_file a safe load that rejects it, so the unrestricted
      # loader is used where it exists.
      # NOTE: unrestricted YAML loading is only acceptable because this file
      # is written by the script itself. Never point it at untrusted data.
      loader = YAML.respond_to?(:unsafe_load_file) ? :unsafe_load_file : :load_file
      # An empty status file loads as nil/false; treat it as "nothing scraped".
      Set.new(YAML.public_send(loader, status_filename) || [])
    else
      Set.new
    end

  login

  begin
    crawl_categories
  ensure
    File.write(status_filename, @scraped_topic_urls.to_yaml)
  end

  elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at
  puts "", "", "Done (%02dh %02dmin %02dsec)" % [elapsed / 3600, elapsed / 60 % 60, elapsed % 60]
end

# Reads the command line options into instance variables and validates them.
#
# Sets @groupname, @cookies, @path, @force_import and @abort_on_error.
# The group name and a cookies file are mandatory; the cookies file defaults
# to DEFAULT_COOKIES_TXT when that file exists. Without --path the output
# goes into a directory named after the group below DEFAULT_OUTPUT_PATH.
# The output directory is created when it is missing.
#
# Invalid options, missing mandatory values and a nonexistent cookies file
# end the script through exit_with_error.
def parse_arguments
  puts ""

  # Defaults that the options below may override.
  @force_import = false
  @abort_on_error = false
  @cookies = DEFAULT_COOKIES_TXT if File.exist?(DEFAULT_COOKIES_TXT)

  parser = OptionParser.new
  parser.banner = "Usage: google_groups.rb [options]"

  parser.on("-g", "--groupname GROUPNAME") { |value| @groupname = value }
  parser.on("-c", "--cookies PATH", "path to cookies.txt") { |value| @cookies = value }
  parser.on("--path PATH", "output path for emails") { |value| @path = value }
  parser.on("-f", "--force", "force import when user isn't allowed to see email addresses") do
    @force_import = true
  end
  parser.on("-a", "--abort-on-error", "abort crawl on error instead of skipping message") do
    @abort_on_error = true
  end
  parser.on("-h", "--help") do
    puts parser
    exit
  end

  begin
    parser.parse!
  rescue OptionParser::ParseError => e
    exit_with_error(e.message, "", parser)
  end

  missing = []
  missing << :groupname if @groupname.nil?
  missing << :cookies if @cookies.nil?

  unless missing.empty?
    exit_with_error("Missing arguments: #{missing.join(', ')}".red.bold, "", parser, "")
  end

  unless File.exist?(@cookies)
    exit_with_error("cookies.txt not found at #{@cookies}".red.bold, "")
  end

  @path ||= File.join(DEFAULT_OUTPUT_PATH, @groupname)
  FileUtils.mkpath(@path)
end

# Script entry point: read the command line options, then start crawling.
parse_arguments
crawl
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`#!/usr/bin/env ruby`
DEV: enable frozen string literal on all files This reduces chances of errors where consumers of strings mutate inputs and reduces memory usage of the app. Test suite passes now, but there may be some stuff left, so we will run a few sites on a branch prior to merging 2019-05-02 18:17:27 -04:00			`# frozen_string_literal: true`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00
			`require "bundler/inline"`

			`gemfile(true) do`
			`source "https://rubygems.org"`

Make Google Groups scraper work with latest chromedriver 2019-03-25 11:10:37 -04:00			`gem "webdrivers"`
FIX: Google Groups crawler failed to login Trying to automate the login into a Google account is quite hard. This makes the crawler use the content of a cookies.txt file instead. It also removes a couple of deprecation warnings and adds some color to the output. 2019-09-18 07:07:07 -04:00			`gem "colored2"`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`end`

			`require "fileutils"`
			`require "optparse"`
			`require "set"`
			`require "yaml"`

			`DEFAULT_OUTPUT_PATH = "/shared/import/data"`
FIX: Google Groups crawler failed to login Trying to automate the login into a Google account is quite hard. This makes the crawler use the content of a cookies.txt file instead. It also removes a couple of deprecation warnings and adds some color to the output. 2019-09-18 07:07:07 -04:00			`DEFAULT_COOKIES_TXT = "/shared/import/cookies.txt"`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00
			`def driver`
			`@driver \|\|= begin`
FIX: Google Groups crawler failed to login Trying to automate the login into a Google account is quite hard. This makes the crawler use the content of a cookies.txt file instead. It also removes a couple of deprecation warnings and adds some color to the output. 2019-09-18 07:07:07 -04:00			`chrome_args = ["disable-gpu"]`
			`chrome_args << "headless" unless ENV["NOT_HEADLESS"] == '1'`
Make Google Groups scraper work with latest chromedriver 2019-03-25 11:10:37 -04:00			`chrome_args << "no-sandbox" if inside_container?`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`options = Selenium::WebDriver::Chrome::Options.new(args: chrome_args)`
FIX: Latest Selenium gem broke Google Groups import script Selenium uses Keep-Alive since version 3.141, so the net-http-persistent gem shouldn't be needed anymore. 2019-07-10 03:45:25 -04:00			`Selenium::WebDriver.for(:chrome, options: options)`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`end`
			`end`

			`def inside_container?`
			`File.foreach("/proc/1/cgroup") do \|line\|`
			`return true if line.include?("docker")`
			`end`

			`false`
			`end`

			`MAX_GET_RETRIES = 5`
			`MAX_FIND_RETRIES = 3`

			`def get(url)`
			`begin`
			`retries \|\|= 0`
			`driver.get(url)`
			`rescue Net::ReadTimeout`
			`sleep retries`
			`retry if (retries += 1) < MAX_GET_RETRIES`
			`end`
			`end`

Make Rubocop happy 2018-10-30 20:30:14 -04:00			`def extract(css, parent_element = driver)`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`begin`
			`retries \|\|= 0`
Make Rubocop happy 2018-10-30 20:30:14 -04:00			`parent_element.find_elements(css: css).map { \|element\| yield(element) }`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`rescue Net::ReadTimeout, Selenium::WebDriver::Error::StaleElementReferenceError`
			`sleep retries`
			`retry if (retries += 1) < MAX_FIND_RETRIES`
			`end`
			`end`

Make Rubocop happy 2018-10-30 20:30:14 -04:00			`def find(css, parent_element = driver)`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`begin`
			`retries \|\|= 0`
Make Rubocop happy 2018-10-30 20:30:14 -04:00			`parent_element.find_element(css: css)`
FIX: Google Groups crawler failed to login Trying to automate the login into a Google account is quite hard. This makes the crawler use the content of a cookies.txt file instead. It also removes a couple of deprecation warnings and adds some color to the output. 2019-09-18 07:07:07 -04:00			`rescue Net::ReadTimeout, Selenium::WebDriver::Error::ElementNotInteractableError`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`sleep retries`
			`retry if (retries += 1) < MAX_FIND_RETRIES`
			`end`
			`end`

			`def crawl_categories`
			`1.step(nil, 100).each do \|start\|`
			`url = "https://groups.google.com/forum/?_escaped_fragment_=categories/#{@groupname}[#{start}-#{start + 99}]"`
			`get(url)`

Make Rubocop happy 2018-10-30 20:30:14 -04:00			`topic_urls = extract(".subject a[href*='#{@groupname}']") { \|a\| a["href"].sub("/d/topic/", "/forum/?_escaped_fragment_=topic/") }`
			`break if topic_urls.size == 0`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00
Make Rubocop happy 2018-10-30 20:30:14 -04:00			`topic_urls.each { \|topic_url\| crawl_topic(topic_url) }`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`end`
			`end`

			`def crawl_topic(url)`
			`if @scraped_topic_urls.include?(url)`
FIX: Google Groups crawler failed to login Trying to automate the login into a Google account is quite hard. This makes the crawler use the content of a cookies.txt file instead. It also removes a couple of deprecation warnings and adds some color to the output. 2019-09-18 07:07:07 -04:00			`puts "Skipping".green << " #{url}"`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`return`
			`end`

			`puts "Scraping #{url}"`
			`get(url)`

			`extract(".subject a[href*='#{@groupname}']") do \|a\|`
			`[`
			`a["href"].sub("/d/msg/", "/forum/message/raw?msg="),`
			`a["title"].empty?`
			`]`
Make Rubocop happy 2018-10-30 20:30:14 -04:00			`end.each { \|msg_url, might_be_deleted\| crawl_message(msg_url, might_be_deleted) }`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00
			`@scraped_topic_urls << url`
			`rescue`
FIX: Google Groups crawler failed to login Trying to automate the login into a Google account is quite hard. This makes the crawler use the content of a cookies.txt file instead. It also removes a couple of deprecation warnings and adds some color to the output. 2019-09-18 07:07:07 -04:00			`puts "Failed to scrape topic at #{url}".red`
FIX: By default, don't abort Google Groups crawling on error 2019-09-18 12:11:52 -04:00			`raise if @abort_on_error`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`end`

			`def crawl_message(url, might_be_deleted)`
			`get(url)`

			`filename = File.join(@path, "#{url[/#{@groupname}\/(.+)/, 1].sub("/", "-")}.eml")`
			`content = find("pre")["innerText"]`

Improve Google Groups scraper * Better error detection during login phase * Experimental support for 2FA and SMS codes * Detect missing permissions to scrape email addresses 2019-03-24 18:08:03 -04:00			`if !@first_message_checked`
			`@first_message_checked = true`

			`if content.match?(/From:.\.\.\.@./i) && !@force_import`
FIX: Google Groups crawler failed to login Trying to automate the login into a Google account is quite hard. This makes the crawler use the content of a cookies.txt file instead. It also removes a couple of deprecation warnings and adds some color to the output. 2019-09-18 07:07:07 -04:00			`exit_with_error(<<~MSG.red.bold)`
Improve Google Groups scraper * Better error detection during login phase * Experimental support for 2FA and SMS codes * Detect missing permissions to scrape email addresses 2019-03-24 18:08:03 -04:00			`It looks like you do not have permissions to see email addresses. Aborting.`
			`Use the --force option to import anyway.`
			`MSG`
			`end`
			`end`

FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`File.write(filename, content)`
			`rescue Selenium::WebDriver::Error::NoSuchElementError`
FIX: By default, don't abort Google Groups crawling on error 2019-09-18 12:11:52 -04:00			`if might_be_deleted`
			`puts "Message might be deleted. Skipping #{url}"`
			`else`
			`puts "Failed to scrape message at #{url}".red`
			`raise if @abort_on_error`
			`end`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`rescue`
FIX: Google Groups crawler failed to login Trying to automate the login into a Google account is quite hard. This makes the crawler use the content of a cookies.txt file instead. It also removes a couple of deprecation warnings and adds some color to the output. 2019-09-18 07:07:07 -04:00			`puts "Failed to scrape message at #{url}".red`
FIX: By default, don't abort Google Groups crawling on error 2019-09-18 12:11:52 -04:00			`raise if @abort_on_error`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`end`

			`def login`
			`puts "Logging in..."`
FIX: Google Groups crawler failed to login Trying to automate the login into a Google account is quite hard. This makes the crawler use the content of a cookies.txt file instead. It also removes a couple of deprecation warnings and adds some color to the output. 2019-09-18 07:07:07 -04:00			`get("https://google.com/404")`

			`add_cookies(`
			`"accounts.google.com",`
			`"myaccount.google.com",`
			`"google.com"`
			`)`

			`get("https://accounts.google.com/servicelogin")`

			`begin`
			`wait_for_url { \|url\| url.start_with?("https://myaccount.google.com") }`
			`rescue Selenium::WebDriver::Error::TimeoutError`
			`exit_with_error("Failed to login. Please check the content of your cookies.txt".red.bold)`
Improve Google Groups scraper * Better error detection during login phase * Experimental support for 2FA and SMS codes * Detect missing permissions to scrape email addresses 2019-03-24 18:08:03 -04:00			`end`
FIX: Google Groups crawler failed to login Trying to automate the login into a Google account is quite hard. This makes the crawler use the content of a cookies.txt file instead. It also removes a couple of deprecation warnings and adds some color to the output. 2019-09-18 07:07:07 -04:00			`end`
Improve Google Groups scraper * Better error detection during login phase * Experimental support for 2FA and SMS codes * Detect missing permissions to scrape email addresses 2019-03-24 18:08:03 -04:00
FIX: Google Groups crawler failed to login Trying to automate the login into a Google account is quite hard. This makes the crawler use the content of a cookies.txt file instead. It also removes a couple of deprecation warnings and adds some color to the output. 2019-09-18 07:07:07 -04:00			`def add_cookies(*domains)`
			`File.readlines(@cookies).each do \|line\|`
			`parts = line.chomp.split("\t")`
			`next if parts.size != 7 \|\| !domains.any? { \|domain\| parts[0] =~ /^\.?#{Regexp.escape(domain)}$/ }`

			`driver.manage.add_cookie(`
			`domain: parts[0],`
			`httpOnly: "true".casecmp?(parts[1]),`
			`path: parts[2],`
			`secure: "true".casecmp?(parts[3]),`
			`expires: parts[4] == "0" ? nil : DateTime.strptime(parts[4], "%s"),`
			`name: parts[5],`
			`value: parts[6]`
			`)`
			`end`
Improve Google Groups scraper * Better error detection during login phase * Experimental support for 2FA and SMS codes * Detect missing permissions to scrape email addresses 2019-03-24 18:08:03 -04:00			`end`

			`def wait_for_url`
			`wait = Selenium::WebDriver::Wait.new(timeout: 5)`
			`wait.until { yield(driver.current_url) }`
			`end`

Make Google Groups scraper work with latest chromedriver 2019-03-25 11:10:37 -04:00			`def exit_with_error(*messages)`
			`STDERR.puts messages`
Improve Google Groups scraper * Better error detection during login phase * Experimental support for 2FA and SMS codes * Detect missing permissions to scrape email addresses 2019-03-24 18:08:03 -04:00			`exit 1`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`end`

			`def crawl`
			`start_time = Time.now`
			`status_filename = File.join(@path, "status.yml")`
			`@scraped_topic_urls = File.exists?(status_filename) ? YAML.load_file(status_filename) : Set.new`

			`login`

			`begin`
			`crawl_categories`
			`ensure`
			`File.write(status_filename, @scraped_topic_urls.to_yaml)`
			`end`

			`elapsed = Time.now - start_time`
			`puts "", "", "Done (%02dh %02dmin %02dsec)" % [elapsed / 3600, elapsed / 60 % 60, elapsed % 60]`
			`end`

			`def parse_arguments`
			`puts ""`

FIX: By default, don't abort Google Groups crawling on error 2019-09-18 12:11:52 -04:00			`# default values`
Improve Google Groups scraper * Better error detection during login phase * Experimental support for 2FA and SMS codes * Detect missing permissions to scrape email addresses 2019-03-24 18:08:03 -04:00			`@force_import = false`
FIX: By default, don't abort Google Groups crawling on error 2019-09-18 12:11:52 -04:00			`@abort_on_error = false`
			`@cookies = DEFAULT_COOKIES_TXT if File.exist?(DEFAULT_COOKIES_TXT)`
Improve Google Groups scraper * Better error detection during login phase * Experimental support for 2FA and SMS codes * Detect missing permissions to scrape email addresses 2019-03-24 18:08:03 -04:00
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`parser = OptionParser.new do \|opts\|`
			`opts.banner = "Usage: google_groups.rb [options]"`

			`opts.on("-g", "--groupname GROUPNAME") { \|v\| @groupname = v }`
FIX: Google Groups crawler failed to login Trying to automate the login into a Google account is quite hard. This makes the crawler use the content of a cookies.txt file instead. It also removes a couple of deprecation warnings and adds some color to the output. 2019-09-18 07:07:07 -04:00			`opts.on("-c", "--cookies PATH", "path to cookies.txt") { \|v\| @cookies = v }`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`opts.on("--path PATH", "output path for emails") { \|v\| @path = v }`
Improve Google Groups scraper * Better error detection during login phase * Experimental support for 2FA and SMS codes * Detect missing permissions to scrape email addresses 2019-03-24 18:08:03 -04:00			`opts.on("-f", "--force", "force import when user isn't allowed to see email addresses") { @force_import = true }`
FIX: By default, don't abort Google Groups crawling on error 2019-09-18 12:11:52 -04:00			`opts.on("-a", "--abort-on-error", "abort crawl on error instead of skipping message") { @abort_on_error = true }`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`opts.on("-h", "--help") do`
			`puts opts`
			`exit`
			`end`
			`end`

			`begin`
			`parser.parse!`
			`rescue OptionParser::ParseError => e`
Make Google Groups scraper work with latest chromedriver 2019-03-25 11:10:37 -04:00			`exit_with_error(e.message, "", parser)`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`end`

FIX: Google Groups crawler failed to login Trying to automate the login into a Google account is quite hard. This makes the crawler use the content of a cookies.txt file instead. It also removes a couple of deprecation warnings and adds some color to the output. 2019-09-18 07:07:07 -04:00			`mandatory = [:groupname, :cookies]`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`missing = mandatory.select { \|name\| instance_variable_get("@#{name}").nil? }`

FIX: Google Groups crawler failed to login Trying to automate the login into a Google account is quite hard. This makes the crawler use the content of a cookies.txt file instead. It also removes a couple of deprecation warnings and adds some color to the output. 2019-09-18 07:07:07 -04:00			`exit_with_error("Missing arguments: #{missing.join(', ')}".red.bold, "", parser, "") if missing.any?`
			`exit_with_error("cookies.txt not found at #{@cookies}".red.bold, "") if !File.exist?(@cookies)`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00
FIX: By default, don't abort Google Groups crawling on error 2019-09-18 12:11:52 -04:00			`@path = File.join(DEFAULT_OUTPUT_PATH, @groupname) if @path.nil?`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`FileUtils.mkpath(@path)`
			`end`

			`parse_arguments`
			`crawl`