discourse/script/import_scripts/google_groups.rb

#!/usr/bin/env ruby
# frozen_string_literal: true

require "bundler/inline"

gemfile(true) do
  source "https://rubygems.org"

  gem "webdrivers"
  gem "colored2"
end

require "date"
require "digest"
require "fileutils"
require "optparse"
require "set"
require "yaml"

# Base directory for scraped messages; parse_arguments appends the group name
# to it when no --path option is given.
DEFAULT_OUTPUT_PATH = "/shared/import/data"
# Cookies file that is used when it exists and no --cookies option is given.
DEFAULT_COOKIES_TXT = "/shared/import/cookies.txt"
# Once more than this many topics were skipped in a row, already scraped topics
# are no longer re-checked and a previously finished crawl stops early.
ABORT_AFTER_SKIPPED_TOPIC_COUNT = 10

# Returns the Chrome WebDriver used by the whole script, creating it on first use.
#
# Chrome always gets "disable-gpu". It runs headless unless the environment
# variable NOT_HEADLESS is set to "1", and without sandbox when
# inside_container? reports true.
def driver
  return @driver if @driver

  flags = ["disable-gpu"]
  flags.push("headless") if ENV["NOT_HEADLESS"] != '1'
  flags.push("no-sandbox") if inside_container?

  chrome_options = Selenium::WebDriver::Chrome::Options.new(args: flags)
  @driver = Selenium::WebDriver.for(:chrome, options: chrome_options)
end

# Tells whether the script appears to run inside a Docker container.
#
# The check looks for the word "docker" in any line of the given cgroup file.
# A missing or unreadable file (for example on hosts without /proc) is treated
# as "not inside a container" instead of raising.
#
# cgroup_path - path of the cgroup file to inspect (defaults to the one of PID 1)
#
# Returns true or false.
def inside_container?(cgroup_path = "/proc/1/cgroup")
  File.foreach(cgroup_path).any? { |line| line.include?("docker") }
rescue SystemCallError
  # Errno::ENOENT, Errno::EACCES, ...: nothing to inspect, so assume a regular host.
  false
end

# Maximum number of attempts `get` makes when page loads time out.
MAX_GET_RETRIES = 5
# Maximum number of attempts `extract` and `find` make before giving up.
MAX_FIND_RETRIES = 3

# Navigates the shared browser to the given URL.
#
# A Net::ReadTimeout is retried after a growing pause (0s, 1s, 2s, ...) until
# MAX_GET_RETRIES attempts were made. When every attempt timed out, the error
# is swallowed and nil is returned.
def get(url)
  attempt = 0

  begin
    driver.get(url)
  rescue Net::ReadTimeout
    sleep attempt
    attempt += 1
    retry if attempt < MAX_GET_RETRIES
  end
end

# Finds all elements matching the CSS selector below parent_element and
# returns an array with the block's result for each of them.
#
# Read timeouts and stale element references restart the whole lookup after a
# growing pause (0s, 1s, ...). After MAX_FIND_RETRIES failed attempts the error
# is swallowed and nil is returned.
def extract(css, parent_element = driver)
  attempt = 0

  begin
    elements = parent_element.find_elements(css: css)
    elements.map { |element| yield(element) }
  rescue Net::ReadTimeout, Selenium::WebDriver::Error::StaleElementReferenceError
    sleep attempt
    attempt += 1
    retry if attempt < MAX_FIND_RETRIES
  end
end

# Returns the first element matching the CSS selector below parent_element.
#
# Read timeouts and ElementNotInteractableError are retried after a growing
# pause (0s, 1s, ...). After MAX_FIND_RETRIES failed attempts the error is
# swallowed and nil is returned. Other errors raised by the lookup are not
# handled here and propagate to the caller.
def find(css, parent_element = driver)
  attempt = 0

  begin
    parent_element.find_element(css: css)
  rescue Net::ReadTimeout, Selenium::WebDriver::Error::ElementNotInteractableError
    sleep attempt
    attempt += 1
    retry if attempt < MAX_FIND_RETRIES
  end
end

# Returns the URL prefix of the group listing.
# The "/a/<domain>" variant is used when a domain was configured via @domain.
def base_url
  return "https://groups.google.com/forum/?_escaped_fragment_=categories" if @domain.nil?

  "https://groups.google.com/a/#{@domain}/forum/?_escaped_fragment_=categories"
end

# Walks through the topic listing of the group in pages of 100 topics and
# crawls every topic found.
#
# The loop ends when a page contains no topic links. It also ends early when a
# previous crawl had finished (@finished) and more than
# ABORT_AFTER_SKIPPED_TOPIC_COUNT topics were skipped in a row.
# On the first page an "Error 403" heading terminates the script with a hint
# about the --domain option.
def crawl_topics
  1.step(by: 100) do |first_index|
    last_index = first_index + 99
    get("#{base_url}/#{@groupname}[#{first_index}-#{last_index}]")

    begin
      if first_index == 1 && find("h2").text == "Error 403"
        exit_with_error(<<~MSG.red.bold)
          Unable to find topics. Try running the script with the "--domain example.com"
          option if you are a G Suite user and your group's URL contains a path with
          your domain that looks like "/a/example.com".
        MSG
      end
    rescue Selenium::WebDriver::Error::NoSuchElementError
      # No <h2> on the page means that no error page was shown.
    end

    topic_urls = extract(".subject a[href*='#{@groupname}']") do |link|
      link["href"].sub("/d/topic/", "/forum/?_escaped_fragment_=topic/")
    end
    break if topic_urls.empty?

    topic_urls.each do |topic_url|
      crawl_topic(topic_url)

      # Only an incremental crawl may stop early, and only after too many
      # consecutive skipped topics.
      next unless @finished && @skipped_topic_count > ABORT_AFTER_SKIPPED_TOPIC_COUNT

      puts "Skipping all other topics, because this is an incremental crawl.".green
      return
    end
  end
end

# Crawls all messages of the topic at the given URL and keeps track of how
# many topics in a row did not contain anything new.
#
# A topic counts as "skipped" only when it was already scraped before AND none
# of its messages is new or changed. Brand-new topics and known topics with new
# messages reset the counter, so that an incremental crawl does not stop early
# while it is still finding new content.
#
# Errors are reported and swallowed unless @abort_on_error is set.
def crawl_topic(url)
  skippable = @scraped_topic_urls.include?(url)

  # Skip this topic if there were already too many consecutive, skipped topics.
  # Otherwise we have to look if there are new messages in the topic.
  if skippable && @skipped_topic_count > ABORT_AFTER_SKIPPED_TOPIC_COUNT
    puts "Skipping".green << " #{url}"
    return
  end

  puts "Scraping #{url}"
  get(url)

  messages_crawled = false

  extract(".subject a[href*='#{@groupname}']") do |a|
    [
      a["href"].sub("/d/msg/", "/forum/message/raw?msg="),
      # an empty title is passed on to crawl_message as its might_be_deleted flag
      a["title"].empty?
    ]
  end.each do |msg_url, might_be_deleted|
    # crawl_message returns a truthy value only for new or changed messages
    messages_crawled = true if crawl_message(msg_url, might_be_deleted)
  end

  unchanged = skippable && !messages_crawled
  @skipped_topic_count = unchanged ? @skipped_topic_count + 1 : 0
  @scraped_topic_urls << url
rescue StandardError
  puts "Failed to scrape topic at #{url}".red
  raise if @abort_on_error
end

# Downloads the raw message at the given URL and stores it as .eml file in @path.
#
# The filename is built from the part of the URL that follows the group name,
# with the first "/" replaced by "-". The group name is matched literally
# (regex-escaped), so names containing "." cannot match a different URL segment.
#
# The first message is checked for masked email addresses ("...@"); in that case
# the script exits unless @force_import is set.
#
# url               - URL of the raw message
# might_be_deleted  - when true, a missing <pre> element is reported as a
#                     possibly deleted message instead of an error
#
# Returns true when the message is new or its content changed, false otherwise
# (including when scraping failed and @abort_on_error is not set).
def crawl_message(url, might_be_deleted)
  get(url)

  message_path = url[/#{Regexp.escape(@groupname)}\/(.+)/, 1]
  filename = File.join(@path, "#{message_path.sub("/", "-")}.eml")
  content = find("pre")["innerText"]

  if !@first_message_checked
    @first_message_checked = true

    if content.match?(/From:.*\.\.\.@.*/i) && !@force_import
      exit_with_error(<<~MSG.red.bold)
        It looks like you do not have permissions to see email addresses. Aborting.
        Use the --force option to import anyway.
      MSG
    end
  end

  # Compare the checksum of the file before and after writing to find out
  # whether the message changed since the last crawl.
  old_md5 = Digest::MD5.file(filename) if File.exist?(filename)
  File.write(filename, content)

  old_md5.nil? || old_md5 != Digest::MD5.file(filename)
rescue Selenium::WebDriver::Error::NoSuchElementError
  if might_be_deleted
    puts "Message might be deleted. Skipping #{url}"
  else
    puts "Failed to scrape message at #{url}".red
    raise if @abort_on_error
  end

  false
rescue StandardError
  puts "Failed to scrape message at #{url}".red
  raise if @abort_on_error

  false
end

# Logs into Google by injecting the cookies from the cookies file into the browser.
#
# The browser first opens a google.com page, then the cookies for
# "myaccount.google.com" and "google.com" are added and the account page is
# requested. The login counts as successful as soon as the browser's URL starts
# with "https://accounts.google.com"; if that does not happen before
# wait_for_url times out, the script exits with an error.
def login
  puts "Logging in..."

  # NOTE(review): presumably a google.com page has to be loaded before cookies
  # for that domain can be added — confirm against the WebDriver documentation.
  get("https://google.com/404")
  add_cookies("myaccount.google.com", "google.com")
  get("https://myaccount.google.com/?utm_source=sign_in_no_continue")

  begin
    wait_for_url do |current_url|
      current_url.start_with?("https://accounts.google.com")
    end
  rescue Selenium::WebDriver::Error::TimeoutError
    exit_with_error("Failed to login. Please check the content of your cookies.txt".red.bold)
  end
end

# Adds the cookies from the file at @cookies to the browser.
#
# Every line is split at tabs. Lines that do not consist of exactly 7 fields
# are ignored, as are cookies whose domain (first field, with or without a
# leading dot) is not one of the given domains.
#
# NOTE(review): the second field is passed on as httpOnly flag; in the Netscape
# cookies.txt format that column is usually the "include subdomains" flag —
# confirm that this mapping is intended.
def add_cookies(*domains)
  domain_patterns = domains.map { |domain| /^\.?#{Regexp.escape(domain)}$/ }

  File.readlines(@cookies).each do |line|
    fields = line.chomp.split("\t")
    next unless fields.size == 7

    cookie_domain, http_only, path, secure, expiry, name, value = fields
    next if domain_patterns.none? { |pattern| cookie_domain =~ pattern }

    # An expiry of "0" means that no expiration date is set for the cookie.
    expires = expiry == "0" ? nil : DateTime.strptime(expiry, "%s")

    driver.manage.add_cookie(
      domain: cookie_domain,
      httpOnly: "true".casecmp?(http_only),
      path: path,
      secure: "true".casecmp?(secure),
      expires: expires,
      name: name,
      value: value
    )
  end
end

# Waits up to 5 seconds until the block returns a truthy value for the
# browser's current URL. The block receives the current URL on each check.
def wait_for_url
  Selenium::WebDriver::Wait.new(timeout: 5).until do
    yield(driver.current_url)
  end
end

# Prints every given message on its own line to STDERR and terminates the
# script with exit status 1.
def exit_with_error(*messages)
  STDERR.puts(*messages)
  exit(1)
end

# Runs a complete crawl: restores the state of a previous run from status.yml
# (if there is one), logs in, crawls all topics and finally stores the state
# again — even when the crawl was interrupted by an error.
#
# The status file contains whether the last crawl finished and the Set of
# already scraped topic URLs. A missing, empty or otherwise unusable status
# file results in a fresh crawl.
def crawl
  start_time = Time.now
  status_filename = File.join(@path, "status.yml")

  status = nil

  # File.exists? was removed in Ruby 3.2, File.exist? works in all versions.
  if File.exist?(status_filename)
    # The status file is written by this script and contains Symbol keys and a
    # Set, which the safe YAML.load_file of Psych 4 rejects. Do NOT point this
    # at files from untrusted sources.
    status =
      if YAML.respond_to?(:unsafe_load_file)
        YAML.unsafe_load_file(status_filename)
      else
        YAML.load_file(status_filename)
      end
  end

  if status.is_a?(Hash)
    @finished = status.fetch(:finished, false)
    @scraped_topic_urls = Set.new(status[:urls])
  else
    @finished = false
    @scraped_topic_urls = Set.new
  end

  @skipped_topic_count = 0

  login

  begin
    crawl_topics
    @finished = true
  ensure
    File.write(status_filename, {
      finished: @finished,
      urls: @scraped_topic_urls
    }.to_yaml)
  end

  elapsed = Time.now - start_time
  puts "", "", "Done (%02dh %02dmin %02dsec)" % [elapsed / 3600, elapsed / 60 % 60, elapsed % 60]
end

# Parses the command line (ARGV) into instance variables and prepares the
# output directory.
#
# --groupname and a cookies file are mandatory; the cookies file defaults to
# DEFAULT_COOKIES_TXT when that file exists. The output path defaults to a
# directory named after the group below DEFAULT_OUTPUT_PATH and is created if
# necessary. Invalid or missing arguments terminate the script with an error.
def parse_arguments
  puts ""

  # default values
  @force_import = false
  @abort_on_error = false
  @cookies = DEFAULT_COOKIES_TXT if File.exist?(DEFAULT_COOKIES_TXT)

  parser = OptionParser.new do |o|
    o.banner = "Usage: google_groups.rb [options]"

    o.on("-g", "--groupname GROUPNAME") { |value| @groupname = value }
    o.on("-d", "--domain DOMAIN") { |value| @domain = value }
    o.on("-c", "--cookies PATH", "path to cookies.txt") { |value| @cookies = value }
    o.on("--path PATH", "output path for emails") { |value| @path = value }
    o.on("-f", "--force", "force import when user isn't allowed to see email addresses") { @force_import = true }
    o.on("-a", "--abort-on-error", "abort crawl on error instead of skipping message") { @abort_on_error = true }
    o.on("-h", "--help") do
      puts o
      exit
    end
  end

  begin
    parser.parse!
  rescue OptionParser::ParseError => error
    exit_with_error(error.message, "", parser)
  end

  missing = %i[groupname cookies].select { |name| instance_variable_get("@#{name}").nil? }

  unless missing.empty?
    exit_with_error("Missing arguments: #{missing.join(', ')}".red.bold, "", parser, "")
  end

  unless File.exist?(@cookies)
    exit_with_error("cookies.txt not found at #{@cookies}".red.bold, "")
  end

  @path ||= File.join(DEFAULT_OUTPUT_PATH, @groupname)
  FileUtils.mkpath(@path)
end

# Script entry point: read the command line options, then run the crawl.
parse_arguments
crawl
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`#!/usr/bin/env ruby`
DEV: enable frozen string literal on all files This reduces chances of errors where consumers of strings mutate inputs and reduces memory usage of the app. Test suite passes now, but there may be some stuff left, so we will run a few sites on a branch prior to merging 2019-05-02 18:17:27 -04:00			`# frozen_string_literal: true`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00
			`require "bundler/inline"`

			`gemfile(true) do`
			`source "https://rubygems.org"`

Make Google Groups scraper work with latest chromedriver 2019-03-25 11:10:37 -04:00			`gem "webdrivers"`
FIX: Google Groups crawler failed to login Trying to automate the login into a Google account is quite hard. This makes the crawler use the content of a cookies.txt file instead. It also removes a couple of deprecation warnings and adds some color to the output. 2019-09-18 07:07:07 -04:00			`gem "colored2"`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`end`

			`require "fileutils"`
			`require "optparse"`
			`require "set"`
			`require "yaml"`

			`DEFAULT_OUTPUT_PATH = "/shared/import/data"`
FIX: Google Groups crawler failed to login Trying to automate the login into a Google account is quite hard. This makes the crawler use the content of a cookies.txt file instead. It also removes a couple of deprecation warnings and adds some color to the output. 2019-09-18 07:07:07 -04:00			`DEFAULT_COOKIES_TXT = "/shared/import/cookies.txt"`
DEV: Better handling of incremental scrapes for Google Groups 2020-03-13 18:59:40 -04:00			`ABORT_AFTER_SKIPPED_TOPIC_COUNT = 10`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00
			`def driver`
			`@driver \|\|= begin`
FIX: Google Groups crawler failed to login Trying to automate the login into a Google account is quite hard. This makes the crawler use the content of a cookies.txt file instead. It also removes a couple of deprecation warnings and adds some color to the output. 2019-09-18 07:07:07 -04:00			`chrome_args = ["disable-gpu"]`
			`chrome_args << "headless" unless ENV["NOT_HEADLESS"] == '1'`
Make Google Groups scraper work with latest chromedriver 2019-03-25 11:10:37 -04:00			`chrome_args << "no-sandbox" if inside_container?`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`options = Selenium::WebDriver::Chrome::Options.new(args: chrome_args)`
FIX: Latest Selenium gem broke Google Groups import script Selenium uses Keep-Alive since version 3.141, so the net-http-persistent gem shouldn't be needed anymore. 2019-07-10 03:45:25 -04:00			`Selenium::WebDriver.for(:chrome, options: options)`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`end`
			`end`

			`def inside_container?`
			`File.foreach("/proc/1/cgroup") do \|line\|`
			`return true if line.include?("docker")`
			`end`

			`false`
			`end`

			`MAX_GET_RETRIES = 5`
			`MAX_FIND_RETRIES = 3`

			`def get(url)`
			`begin`
			`retries \|\|= 0`
			`driver.get(url)`
			`rescue Net::ReadTimeout`
			`sleep retries`
			`retry if (retries += 1) < MAX_GET_RETRIES`
			`end`
			`end`

Make Rubocop happy 2018-10-30 20:30:14 -04:00			`def extract(css, parent_element = driver)`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`begin`
			`retries \|\|= 0`
Make Rubocop happy 2018-10-30 20:30:14 -04:00			`parent_element.find_elements(css: css).map { \|element\| yield(element) }`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`rescue Net::ReadTimeout, Selenium::WebDriver::Error::StaleElementReferenceError`
			`sleep retries`
			`retry if (retries += 1) < MAX_FIND_RETRIES`
			`end`
			`end`

Make Rubocop happy 2018-10-30 20:30:14 -04:00			`def find(css, parent_element = driver)`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`begin`
			`retries \|\|= 0`
Make Rubocop happy 2018-10-30 20:30:14 -04:00			`parent_element.find_element(css: css)`
FIX: Google Groups crawler failed to login Trying to automate the login into a Google account is quite hard. This makes the crawler use the content of a cookies.txt file instead. It also removes a couple of deprecation warnings and adds some color to the output. 2019-09-18 07:07:07 -04:00			`rescue Net::ReadTimeout, Selenium::WebDriver::Error::ElementNotInteractableError`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`sleep retries`
			`retry if (retries += 1) < MAX_FIND_RETRIES`
			`end`
			`end`

FIX: Make Google Groups scraper work for G Suite users 2019-11-27 20:09:05 -05:00			`def base_url`
			`if @domain.nil?`
			`"https://groups.google.com/forum/?_escaped_fragment_=categories"`
			`else`
			`"https://groups.google.com/a/#{@domain}/forum/?_escaped_fragment_=categories"`
			`end`
			`end`

DEV: Better handling of incremental scrapes for Google Groups 2020-03-13 18:59:40 -04:00			`def crawl_topics`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`1.step(nil, 100).each do \|start\|`
FIX: Make Google Groups scraper work for G Suite users 2019-11-27 20:09:05 -05:00			`url = "#{base_url}/#{@groupname}[#{start}-#{start + 99}]"`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`get(url)`

FIX: Make Google Groups scraper work for G Suite users 2019-11-27 20:09:05 -05:00			`begin`
			`if start == 1 && find("h2").text == "Error 403"`
			`exit_with_error(<<~MSG.red.bold)`
			`Unable to find topics. Try running the script with the "--domain example.com"`
			`option if you are a G Suite user and your group's URL contains a path with`
			`your domain that looks like "/a/example.com".`
			`MSG`
			`end`
			`rescue Selenium::WebDriver::Error::NoSuchElementError`
			`# Ignore this error. It simply means there wasn't an error.`
			`end`

Make Rubocop happy 2018-10-30 20:30:14 -04:00			`topic_urls = extract(".subject a[href*='#{@groupname}']") { \|a\| a["href"].sub("/d/topic/", "/forum/?_escaped_fragment_=topic/") }`
			`break if topic_urls.size == 0`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00
DEV: Better handling of incremental scrapes for Google Groups 2020-03-13 18:59:40 -04:00			`topic_urls.each do \|topic_url\|`
			`crawl_topic(topic_url)`

			`# abort if this in an incremental crawl and there were too many consecutive, skipped topics`
			`if @finished && @skipped_topic_count > ABORT_AFTER_SKIPPED_TOPIC_COUNT`
			`puts "Skipping all other topics, because this is an incremental crawl.".green`
			`return`
			`end`
			`end`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`end`
			`end`

			`def crawl_topic(url)`
DEV: Better handling of incremental scrapes for Google Groups 2020-03-13 18:59:40 -04:00			`skippable = @scraped_topic_urls.include?(url)`

			`# Skip this topic if there were already too many consecutive, skipped topics.`
			`# Otherwise we have to look if there are new messages in the topic.`
			`if skippable && @skipped_topic_count > ABORT_AFTER_SKIPPED_TOPIC_COUNT`
FIX: Google Groups crawler failed to login Trying to automate the login into a Google account is quite hard. This makes the crawler use the content of a cookies.txt file instead. It also removes a couple of deprecation warnings and adds some color to the output. 2019-09-18 07:07:07 -04:00			`puts "Skipping".green << " #{url}"`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`return`
			`end`

			`puts "Scraping #{url}"`
			`get(url)`

DEV: Better handling of incremental scrapes for Google Groups 2020-03-13 18:59:40 -04:00			`messsages_crawled = false`

FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`extract(".subject a[href*='#{@groupname}']") do \|a\|`
			`[`
			`a["href"].sub("/d/msg/", "/forum/message/raw?msg="),`
			`a["title"].empty?`
			`]`
DEV: Better handling of incremental scrapes for Google Groups 2020-03-13 18:59:40 -04:00			`end.each do \|msg_url, might_be_deleted\|`
			`messsages_crawled \|= crawl_message(msg_url, might_be_deleted)`
			`end`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00
DEV: Better handling of incremental scrapes for Google Groups 2020-03-13 18:59:40 -04:00			`@skipped_topic_count = skippable && messsages_crawled ? 0 : @skipped_topic_count + 1`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`@scraped_topic_urls << url`
			`rescue`
FIX: Google Groups crawler failed to login Trying to automate the login into a Google account is quite hard. This makes the crawler use the content of a cookies.txt file instead. It also removes a couple of deprecation warnings and adds some color to the output. 2019-09-18 07:07:07 -04:00			`puts "Failed to scrape topic at #{url}".red`
FIX: By default, don't abort Google Groups crawling on error 2019-09-18 12:11:52 -04:00			`raise if @abort_on_error`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`end`

			`def crawl_message(url, might_be_deleted)`
			`get(url)`

			`filename = File.join(@path, "#{url[/#{@groupname}\/(.+)/, 1].sub("/", "-")}.eml")`
			`content = find("pre")["innerText"]`

Improve Google Groups scraper * Better error detection during login phase * Experimental support for 2FA and SMS codes * Detect missing permissions to scrape email addresses 2019-03-24 18:08:03 -04:00			`if !@first_message_checked`
			`@first_message_checked = true`

			`if content.match?(/From:.\.\.\.@./i) && !@force_import`
FIX: Google Groups crawler failed to login Trying to automate the login into a Google account is quite hard. This makes the crawler use the content of a cookies.txt file instead. It also removes a couple of deprecation warnings and adds some color to the output. 2019-09-18 07:07:07 -04:00			`exit_with_error(<<~MSG.red.bold)`
Improve Google Groups scraper * Better error detection during login phase * Experimental support for 2FA and SMS codes * Detect missing permissions to scrape email addresses 2019-03-24 18:08:03 -04:00			`It looks like you do not have permissions to see email addresses. Aborting.`
			`Use the --force option to import anyway.`
			`MSG`
			`end`
			`end`

DEV: Better handling of incremental scrapes for Google Groups 2020-03-13 18:59:40 -04:00			`old_md5 = Digest::MD5.file(filename) if File.exist?(filename)`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`File.write(filename, content)`
DEV: Better handling of incremental scrapes for Google Groups 2020-03-13 18:59:40 -04:00
			`old_md5 ? old_md5 != Digest::MD5.file(filename) : true`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`rescue Selenium::WebDriver::Error::NoSuchElementError`
FIX: By default, don't abort Google Groups crawling on error 2019-09-18 12:11:52 -04:00			`if might_be_deleted`
			`puts "Message might be deleted. Skipping #{url}"`
			`else`
			`puts "Failed to scrape message at #{url}".red`
			`raise if @abort_on_error`
			`end`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`rescue`
FIX: Google Groups crawler failed to login Trying to automate the login into a Google account is quite hard. This makes the crawler use the content of a cookies.txt file instead. It also removes a couple of deprecation warnings and adds some color to the output. 2019-09-18 07:07:07 -04:00			`puts "Failed to scrape message at #{url}".red`
FIX: By default, don't abort Google Groups crawling on error 2019-09-18 12:11:52 -04:00			`raise if @abort_on_error`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`end`

			`def login`
			`puts "Logging in..."`
FIX: Google Groups crawler failed to login Trying to automate the login into a Google account is quite hard. This makes the crawler use the content of a cookies.txt file instead. It also removes a couple of deprecation warnings and adds some color to the output. 2019-09-18 07:07:07 -04:00			`get("https://google.com/404")`

			`add_cookies(`
			`"myaccount.google.com",`
			`"google.com"`
			`)`

FIX: Google Groups scraper failed to login 2020-03-02 11:24:48 -05:00			`get("https://myaccount.google.com/?utm_source=sign_in_no_continue")`
FIX: Google Groups crawler failed to login Trying to automate the login into a Google account is quite hard. This makes the crawler use the content of a cookies.txt file instead. It also removes a couple of deprecation warnings and adds some color to the output. 2019-09-18 07:07:07 -04:00
			`begin`
FIX: Google groups import changed login URL (#9432) I'm not clear why changing only the `wait_for_url` address was necessary and not also the `get` a few lines above, but this change seems to work for me on both literatecomputing.com Groups and a public group. 2020-04-15 16:45:14 -04:00			`wait_for_url { \|url\| url.start_with?("https://accounts.google.com") }`
FIX: Google Groups crawler failed to login Trying to automate the login into a Google account is quite hard. This makes the crawler use the content of a cookies.txt file instead. It also removes a couple of deprecation warnings and adds some color to the output. 2019-09-18 07:07:07 -04:00			`rescue Selenium::WebDriver::Error::TimeoutError`
			`exit_with_error("Failed to login. Please check the content of your cookies.txt".red.bold)`
Improve Google Groups scraper * Better error detection during login phase * Experimental support for 2FA and SMS codes * Detect missing permissions to scrape email addresses 2019-03-24 18:08:03 -04:00			`end`
FIX: Google Groups crawler failed to login Trying to automate the login into a Google account is quite hard. This makes the crawler use the content of a cookies.txt file instead. It also removes a couple of deprecation warnings and adds some color to the output. 2019-09-18 07:07:07 -04:00			`end`
Improve Google Groups scraper * Better error detection during login phase * Experimental support for 2FA and SMS codes * Detect missing permissions to scrape email addresses 2019-03-24 18:08:03 -04:00
FIX: Google Groups crawler failed to login Trying to automate the login into a Google account is quite hard. This makes the crawler use the content of a cookies.txt file instead. It also removes a couple of deprecation warnings and adds some color to the output. 2019-09-18 07:07:07 -04:00			`def add_cookies(*domains)`
			`File.readlines(@cookies).each do \|line\|`
			`parts = line.chomp.split("\t")`
			`next if parts.size != 7 \|\| !domains.any? { \|domain\| parts[0] =~ /^\.?#{Regexp.escape(domain)}$/ }`

			`driver.manage.add_cookie(`
			`domain: parts[0],`
			`httpOnly: "true".casecmp?(parts[1]),`
			`path: parts[2],`
			`secure: "true".casecmp?(parts[3]),`
			`expires: parts[4] == "0" ? nil : DateTime.strptime(parts[4], "%s"),`
			`name: parts[5],`
			`value: parts[6]`
			`)`
			`end`
Improve Google Groups scraper * Better error detection during login phase * Experimental support for 2FA and SMS codes * Detect missing permissions to scrape email addresses 2019-03-24 18:08:03 -04:00			`end`

			`def wait_for_url`
			`wait = Selenium::WebDriver::Wait.new(timeout: 5)`
			`wait.until { yield(driver.current_url) }`
			`end`

Make Google Groups scraper work with latest chromedriver 2019-03-25 11:10:37 -04:00			`def exit_with_error(*messages)`
			`STDERR.puts messages`
Improve Google Groups scraper * Better error detection during login phase * Experimental support for 2FA and SMS codes * Detect missing permissions to scrape email addresses 2019-03-24 18:08:03 -04:00			`exit 1`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`end`

			`def crawl`
			`start_time = Time.now`
			`status_filename = File.join(@path, "status.yml")`
DEV: Better handling of incremental scrapes for Google Groups 2020-03-13 18:59:40 -04:00
			`if File.exists?(status_filename)`
			`yaml = YAML.load_file(status_filename)`
			`@finished = yaml[:finished]`
			`@scraped_topic_urls = yaml[:urls]`
			`else`
			`@finished = false`
			`@scraped_topic_urls = Set.new`
			`end`

			`@skipped_topic_count = 0`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00
			`login`

			`begin`
DEV: Better handling of incremental scrapes for Google Groups 2020-03-13 18:59:40 -04:00			`crawl_topics`
			`@finished = true`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`ensure`
DEV: Better handling of incremental scrapes for Google Groups 2020-03-13 18:59:40 -04:00			`File.write(status_filename, {`
			`finished: @finished,`
			`urls: @scraped_topic_urls`
			`}.to_yaml)`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`end`

			`elapsed = Time.now - start_time`
			`puts "", "", "Done (%02dh %02dmin %02dsec)" % [elapsed / 3600, elapsed / 60 % 60, elapsed % 60]`
			`end`

			`def parse_arguments`
			`puts ""`

FIX: By default, don't abort Google Groups crawling on error 2019-09-18 12:11:52 -04:00			`# default values`
Improve Google Groups scraper * Better error detection during login phase * Experimental support for 2FA and SMS codes * Detect missing permissions to scrape email addresses 2019-03-24 18:08:03 -04:00			`@force_import = false`
FIX: By default, don't abort Google Groups crawling on error 2019-09-18 12:11:52 -04:00			`@abort_on_error = false`
			`@cookies = DEFAULT_COOKIES_TXT if File.exist?(DEFAULT_COOKIES_TXT)`
Improve Google Groups scraper * Better error detection during login phase * Experimental support for 2FA and SMS codes * Detect missing permissions to scrape email addresses 2019-03-24 18:08:03 -04:00
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`parser = OptionParser.new do \|opts\|`
			`opts.banner = "Usage: google_groups.rb [options]"`

			`opts.on("-g", "--groupname GROUPNAME") { \|v\| @groupname = v }`
FIX: Make Google Groups scraper work for G Suite users 2019-11-27 20:09:05 -05:00			`opts.on("-d", "--domain DOMAIN") { \|v\| @domain = v }`
FIX: Google Groups crawler failed to login Trying to automate the login into a Google account is quite hard. This makes the crawler use the content of a cookies.txt file instead. It also removes a couple of deprecation warnings and adds some color to the output. 2019-09-18 07:07:07 -04:00			`opts.on("-c", "--cookies PATH", "path to cookies.txt") { \|v\| @cookies = v }`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`opts.on("--path PATH", "output path for emails") { \|v\| @path = v }`
Improve Google Groups scraper * Better error detection during login phase * Experimental support for 2FA and SMS codes * Detect missing permissions to scrape email addresses 2019-03-24 18:08:03 -04:00			`opts.on("-f", "--force", "force import when user isn't allowed to see email addresses") { @force_import = true }`
FIX: By default, don't abort Google Groups crawling on error 2019-09-18 12:11:52 -04:00			`opts.on("-a", "--abort-on-error", "abort crawl on error instead of skipping message") { @abort_on_error = true }`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`opts.on("-h", "--help") do`
			`puts opts`
			`exit`
			`end`
			`end`

			`begin`
			`parser.parse!`
			`rescue OptionParser::ParseError => e`
Make Google Groups scraper work with latest chromedriver 2019-03-25 11:10:37 -04:00			`exit_with_error(e.message, "", parser)`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`end`

FIX: Google Groups crawler failed to login Trying to automate the login into a Google account is quite hard. This makes the crawler use the content of a cookies.txt file instead. It also removes a couple of deprecation warnings and adds some color to the output. 2019-09-18 07:07:07 -04:00			`mandatory = [:groupname, :cookies]`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`missing = mandatory.select { \|name\| instance_variable_get("@#{name}").nil? }`

FIX: Google Groups crawler failed to login Trying to automate the login into a Google account is quite hard. This makes the crawler use the content of a cookies.txt file instead. It also removes a couple of deprecation warnings and adds some color to the output. 2019-09-18 07:07:07 -04:00			`exit_with_error("Missing arguments: #{missing.join(', ')}".red.bold, "", parser, "") if missing.any?`
			`exit_with_error("cookies.txt not found at #{@cookies}".red.bold, "") if !File.exist?(@cookies)`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00
FIX: By default, don't abort Google Groups crawling on error 2019-09-18 12:11:52 -04:00			`@path = File.join(DEFAULT_OUTPUT_PATH, @groupname) if @path.nil?`
FEATURE: Add download script for Google Groups 2018-10-30 19:31:20 -04:00			`FileUtils.mkpath(@path)`
			`end`

			`parse_arguments`
			`crawl`