#!/usr/bin/env ruby
# frozen_string_literal: true

require "bundler/inline"

gemfile(true) do
  source "https://rubygems.org"

  gem "webdrivers"
  gem "colored2"
end

require "date"
require "digest"
require "fileutils"
require "optparse"
require "set"
require "yaml"

DEFAULT_OUTPUT_PATH = "/shared/import/data"
DEFAULT_COOKIES_TXT = "/shared/import/cookies.txt"
ABORT_AFTER_SKIPPED_TOPIC_COUNT = 10

def driver
  @driver ||= begin
    chrome_args = ["disable-gpu"]
    chrome_args << "headless" unless ENV["NOT_HEADLESS"] == "1"
    chrome_args << "no-sandbox" if inside_container?
    options = Selenium::WebDriver::Chrome::Options.new(args: chrome_args)
    Selenium::WebDriver.for(:chrome, options: options)
  end
end

def inside_container?
  File.foreach("/proc/1/cgroup") do |line|
    return true if line.include?("docker")
  end

  false
end

MAX_GET_RETRIES = 5
MAX_FIND_RETRIES = 3

def get(url)
  begin
    retries ||= 0
    driver.get(url)
  rescue Net::ReadTimeout
    sleep retries
    retry if (retries += 1) < MAX_GET_RETRIES
  end
end

def extract(css, parent_element = driver)
  begin
    retries ||= 0
    parent_element.find_elements(css: css).map { |element| yield(element) }
  rescue Net::ReadTimeout, Selenium::WebDriver::Error::StaleElementReferenceError
    sleep retries
    retry if (retries += 1) < MAX_FIND_RETRIES
  end
end

def find(css, parent_element = driver)
  begin
    retries ||= 0
    parent_element.find_element(css: css)
  rescue Net::ReadTimeout, Selenium::WebDriver::Error::ElementNotInteractableError
    sleep retries
    retry if (retries += 1) < MAX_FIND_RETRIES
  end
end

def base_url
  if @domain.nil?
    "https://groups.google.com/forum/?_escaped_fragment_=categories"
  else
    "https://groups.google.com/a/#{@domain}/forum/?_escaped_fragment_=categories"
  end
end

def crawl_topics
  1.step(nil, 100).each do |start|
    url = "#{base_url}/#{@groupname}[#{start}-#{start + 99}]"
    get(url)

    begin
      if start == 1 && find("h2").text == "Error 403"
        exit_with_error(<<~MSG.red.bold)
          Unable to find topics. Try running the script with the "--domain example.com"
          option if you are a G Suite user and your group's URL contains a path with
          your domain that looks like "/a/example.com".
        MSG
      end
    rescue Selenium::WebDriver::Error::NoSuchElementError
      # Ignore this error. It simply means there wasn't an error.
    end

    topic_urls = extract(".subject a[href*='#{@groupname}']") do |a|
      a["href"].sub("/d/topic/", "/forum/?_escaped_fragment_=topic/")
    end
    break if topic_urls.size == 0

    topic_urls.each do |topic_url|
      crawl_topic(topic_url)

      # Abort if this is an incremental crawl and there were too many consecutive, skipped topics.
      if @finished && @skipped_topic_count > ABORT_AFTER_SKIPPED_TOPIC_COUNT
        puts "Skipping all other topics, because this is an incremental crawl.".green
        return
      end
    end
  end
end

def crawl_topic(url)
  skippable = @scraped_topic_urls.include?(url)

  # Skip this topic if there were already too many consecutive, skipped topics.
  # Otherwise we have to look if there are new messages in the topic.
  if skippable && @skipped_topic_count > ABORT_AFTER_SKIPPED_TOPIC_COUNT
    puts "Skipping".green << " #{url}"
    return
  end

  puts "Scraping #{url}"
  get(url)

  messages_crawled = false

  extract(".subject a[href*='#{@groupname}']") do |a|
    [
      a["href"].sub("/d/msg/", "/forum/message/raw?msg="),
      a["title"].empty?
    ]
  end.each do |msg_url, might_be_deleted|
    messages_crawled |= crawl_message(msg_url, might_be_deleted)
  end

  @skipped_topic_count = skippable && messages_crawled ? 0 : @skipped_topic_count + 1
  @scraped_topic_urls << url
rescue
  puts "Failed to scrape topic at #{url}".red
  raise if @abort_on_error
end
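# Downloads the raw source of a single message and stores it as an .eml file.
# Returns true when the file is new or its content changed (MD5 mismatch);
# crawl_topic uses that return value to decide whether an already-seen topic
# still counts as "skipped" during an incremental crawl.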
def crawl_message(url, might_be_deleted)
  get(url)

  filename = File.join(@path, "#{url[/#{@groupname}\/(.+)/, 1].sub("/", "-")}.eml")
  content = find("pre")["innerText"]

  if !@first_message_checked
    @first_message_checked = true

    if content.match?(/From:.*\.\.\.@.*/i) && !@force_import
      exit_with_error(<<~MSG.red.bold)
        It looks like you do not have permissions to see email addresses. Aborting.
        Use the --force option to import anyway.
      MSG
    end
  end

  old_md5 = Digest::MD5.file(filename) if File.exist?(filename)
  File.write(filename, content)

  old_md5 ? old_md5 != Digest::MD5.file(filename) : true
rescue Selenium::WebDriver::Error::NoSuchElementError
  if might_be_deleted
    puts "Message might be deleted. Skipping #{url}"
  else
    puts "Failed to scrape message at #{url}".red
    raise if @abort_on_error
  end
rescue
  puts "Failed to scrape message at #{url}".red
  raise if @abort_on_error
end

def login
  puts "Logging in..."
  get("https://google.com/404")

  add_cookies(
    "myaccount.google.com",
    "google.com"
  )

  get("https://myaccount.google.com/?utm_source=sign_in_no_continue")

  begin
    wait_for_url { |url| url.start_with?("https://accounts.google.com") }
  rescue Selenium::WebDriver::Error::TimeoutError
    exit_with_error("Failed to login. Please check the content of your cookies.txt".red.bold)
  end
end

def add_cookies(*domains)
  File.readlines(@cookies).each do |line|
    parts = line.chomp.split("\t")
    next if parts.size != 7 || !domains.any? { |domain| parts[0] =~ /^\.?#{Regexp.escape(domain)}$/ }

    driver.manage.add_cookie(
      domain: parts[0],
      httpOnly: "true".casecmp?(parts[1]),
      path: parts[2],
      secure: "true".casecmp?(parts[3]),
      expires: parts[4] == "0" ? nil : DateTime.strptime(parts[4], "%s"),
      name: parts[5],
      value: parts[6]
    )
  end
end

def wait_for_url
  wait = Selenium::WebDriver::Wait.new(timeout: 5)
  wait.until { yield(driver.current_url) }
end

def exit_with_error(*messages)
  STDERR.puts messages
  exit 1
end

def crawl
  start_time = Time.now
  status_filename = File.join(@path, "status.yml")

  if File.exist?(status_filename)
    yaml = YAML.load_file(status_filename)
    @finished = yaml[:finished]
    @scraped_topic_urls = yaml[:urls]
  else
    @finished = false
    @scraped_topic_urls = Set.new
  end

  @skipped_topic_count = 0

  login

  begin
    crawl_topics
    @finished = true
  ensure
    File.write(status_filename, { finished: @finished, urls: @scraped_topic_urls }.to_yaml)
  end

  elapsed = Time.now - start_time
  puts "", "", "Done (%02dh %02dmin %02dsec)" % [elapsed / 3600, elapsed / 60 % 60, elapsed % 60]
end
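# The cookies file is expected to contain one cookie per line with seven
# tab-separated fields, read by add_cookies above in this order:
# domain, httpOnly flag, path, secure flag, expiry (Unix timestamp, 0 = session),
# name, value. A hypothetical line (values are placeholders) might look like:
#
#   .google.com<TAB>true<TAB>/<TAB>true<TAB>0<TAB>SID<TAB>abc123
#
# Lines whose domain does not match one of the requested domains are ignored.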
def parse_arguments
  puts ""

  # default values
  @force_import = false
  @abort_on_error = false
  @cookies = DEFAULT_COOKIES_TXT if File.exist?(DEFAULT_COOKIES_TXT)

  parser = OptionParser.new do |opts|
    opts.banner = "Usage: google_groups.rb [options]"

    opts.on("-g", "--groupname GROUPNAME") { |v| @groupname = v }
    opts.on("-d", "--domain DOMAIN") { |v| @domain = v }
    opts.on("-c", "--cookies PATH", "path to cookies.txt") { |v| @cookies = v }
    opts.on("--path PATH", "output path for emails") { |v| @path = v }
    opts.on("-f", "--force", "force import when user isn't allowed to see email addresses") { @force_import = true }
    opts.on("-a", "--abort-on-error", "abort crawl on error instead of skipping message") { @abort_on_error = true }

    opts.on("-h", "--help") do
      puts opts
      exit
    end
  end

  begin
    parser.parse!
  rescue OptionParser::ParseError => e
    exit_with_error(e.message, "", parser)
  end

  mandatory = [:groupname, :cookies]
  missing = mandatory.select { |name| instance_variable_get("@#{name}").nil? }

  exit_with_error("Missing arguments: #{missing.join(', ')}".red.bold, "", parser, "") if missing.any?
  exit_with_error("cookies.txt not found at #{@cookies}".red.bold, "") if !File.exist?(@cookies)

  @path = File.join(DEFAULT_OUTPUT_PATH, @groupname) if @path.nil?
  FileUtils.mkpath(@path)
end

parse_arguments
crawl
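# Example invocation (group name and paths are placeholders, not real values):
#
#   ruby google_groups.rb --groupname my-group --cookies /shared/import/cookies.txt
#
# Add "--domain example.com" for a G Suite group whose URL contains "/a/example.com",
# and set NOT_HEADLESS=1 in the environment to watch the Chrome session while debugging.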