2018-10-30 19:31:20 -04:00
|
|
|
#!/usr/bin/env ruby
|
2019-05-02 18:17:27 -04:00
|
|
|
# frozen_string_literal: true
|
2018-10-30 19:31:20 -04:00
|
|
|
|
|
|
|
require "bundler/inline"
|
|
|
|
|
|
|
|
gemfile(true) do
|
|
|
|
source "https://rubygems.org"
|
|
|
|
|
2019-03-25 11:10:37 -04:00
|
|
|
gem "webdrivers"
|
2019-09-18 07:07:07 -04:00
|
|
|
gem "colored2"
|
2018-10-30 19:31:20 -04:00
|
|
|
end
|
|
|
|
|
|
|
|
require "fileutils"
|
|
|
|
require "optparse"
|
|
|
|
require "set"
|
|
|
|
require "yaml"
|
|
|
|
|
|
|
|
# Base directory for scraped messages; parse_arguments appends the group name
# to it when no --path option is given.
DEFAULT_OUTPUT_PATH = "/shared/import/data"

# Cookie file that parse_arguments picks up automatically (when it exists)
# if no --cookies option is given.
DEFAULT_COOKIES_TXT = "/shared/import/cookies.txt"

# Threshold of consecutive skipped topics. Once the counter exceeds this value,
# already-scraped topics are skipped without being fetched and an incremental
# crawl stops looking at further topics.
ABORT_AFTER_SKIPPED_TOPIC_COUNT = 10
|
2018-10-30 19:31:20 -04:00
|
|
|
|
|
|
|
# Returns the Selenium Chrome driver, creating and memoizing it on first use.
#
# Chrome is always started with "disable-gpu". It runs headless unless the
# NOT_HEADLESS environment variable is exactly "1", and gets "no-sandbox"
# when inside_container? reports true.
def driver
  @driver ||= begin
    chrome_args = ["disable-gpu"]
    chrome_args << "headless" unless ENV["NOT_HEADLESS"] == "1"
    chrome_args << "no-sandbox" if inside_container?

    options = Selenium::WebDriver::Chrome::Options.new(args: chrome_args)
    Selenium::WebDriver.for(:chrome, options: options)
  end
end
|
|
|
|
|
|
|
|
# Reports whether any line of the given cgroup file mentions "docker".
#
# @param cgroup_file [String] file to scan; defaults to "/proc/1/cgroup"
# @return [Boolean] true when a line contains "docker"; false otherwise,
#   including when the file is missing or unreadable (e.g. on systems
#   without /proc) instead of raising
def inside_container?(cgroup_file = "/proc/1/cgroup")
  File.foreach(cgroup_file).any? { |line| line.include?("docker") }
rescue SystemCallError
  false
end
|
|
|
|
|
|
|
|
# Maximum number of attempts `get` makes to load a URL.
MAX_GET_RETRIES = 5

# Maximum number of attempts `extract` and `find` make to look up elements.
MAX_FIND_RETRIES = 3
|
|
|
|
|
|
|
|
# Navigates the browser to +url+, retrying when the request times out.
#
# Before the n-th retry it sleeps n - 1 seconds (0, 1, 2, ...). At most
# MAX_GET_RETRIES attempts are made.
#
# @param url [String] address to load
# @raise [Net::ReadTimeout] when every attempt timed out. The error is
#   re-raised rather than swallowed so callers never continue with whatever
#   page happened to be loaded before.
def get(url)
  attempts = 0

  begin
    driver.get(url)
  rescue Net::ReadTimeout
    attempts += 1
    raise if attempts >= MAX_GET_RETRIES

    sleep(attempts - 1)
    retry
  end
end
|
|
|
|
|
2018-10-30 20:30:14 -04:00
|
|
|
# Finds all elements matching +css+ below +parent_element+ and maps each one
# through the given block.
#
# Timeouts and stale element references are retried, sleeping n - 1 seconds
# before the n-th retry, for at most MAX_FIND_RETRIES attempts. A retry starts
# over, so the block may be called again for elements it has already seen.
#
# @param css [String] CSS selector
# @param parent_element [#find_elements] search root; defaults to the driver
# @yieldparam element the matched element
# @return [Array] the block's results, one per matched element
# @raise [Net::ReadTimeout, Selenium::WebDriver::Error::StaleElementReferenceError]
#   when the last attempt failed too (re-raised instead of returning nil)
def extract(css, parent_element = driver)
  attempts = 0

  begin
    parent_element.find_elements(css: css).map { |element| yield(element) }
  rescue Net::ReadTimeout, Selenium::WebDriver::Error::StaleElementReferenceError
    attempts += 1
    raise if attempts >= MAX_FIND_RETRIES

    sleep(attempts - 1)
    retry
  end
end
|
|
|
|
|
2018-10-30 20:30:14 -04:00
|
|
|
# Finds the first element matching +css+ below +parent_element+.
#
# Timeouts and "element not interactable" errors are retried, sleeping n - 1
# seconds before the n-th retry, for at most MAX_FIND_RETRIES attempts. Any
# other error (for example a missing element) propagates immediately.
#
# @param css [String] CSS selector
# @param parent_element [#find_element] search root; defaults to the driver
# @return the element returned by +find_element+
# @raise [Net::ReadTimeout, Selenium::WebDriver::Error::ElementNotInteractableError]
#   when the last attempt failed too (re-raised instead of returning nil)
def find(css, parent_element = driver)
  attempts = 0

  begin
    parent_element.find_element(css: css)
  rescue Net::ReadTimeout, Selenium::WebDriver::Error::ElementNotInteractableError
    attempts += 1
    raise if attempts >= MAX_FIND_RETRIES

    sleep(attempts - 1)
    retry
  end
end
|
|
|
|
|
2019-11-27 20:09:05 -05:00
|
|
|
# Returns the URL of the Google Groups category listing.
#
# When @domain is set, the URL carries the "/a/<domain>" path segment;
# otherwise the plain groups.google.com forum URL is used.
#
# @return [String]
def base_url
  if @domain.nil?
    "https://groups.google.com/forum/?_escaped_fragment_=categories"
  else
    "https://groups.google.com/a/#{@domain}/forum/?_escaped_fragment_=categories"
  end
end
|
|
|
|
|
2020-03-13 18:59:40 -04:00
|
|
|
# Walks the group's topic listing in pages of 100 entries and crawls every
# topic found.
#
# The loop stops when a listing page contains no topic links, or - in an
# incremental crawl (@finished is true) - as soon as @skipped_topic_count
# exceeds ABORT_AFTER_SKIPPED_TOPIC_COUNT. If the first page shows an
# "Error 403" heading, the script exits with a hint about the --domain option.
def crawl_topics
  1.step(by: 100) do |start|
    url = "#{base_url}/#{@groupname}[#{start}-#{start + 99}]"
    get(url)

    begin
      if start == 1 && find("h2").text == "Error 403"
        exit_with_error(<<~TEXT.red.bold)
          Unable to find topics. Try running the script with the "--domain example.com"
          option if you are a G Suite user and your group's URL contains a path with
          your domain that looks like "/a/example.com".
        TEXT
      end
    rescue Selenium::WebDriver::Error::NoSuchElementError
      # Ignore this error. It simply means there wasn't an error.
    end

    # Rewrite each topic link to its "_escaped_fragment_" form.
    topic_urls = extract(".subject a[href*='#{@groupname}']") do |a|
      a["href"].sub("/d/topic/", "/forum/?_escaped_fragment_=topic/")
    end
    break if topic_urls.empty?

    topic_urls.each do |topic_url|
      crawl_topic(topic_url)

      # abort if this is an incremental crawl and there were too many consecutive, skipped topics
      if @finished && @skipped_topic_count > ABORT_AFTER_SKIPPED_TOPIC_COUNT
        puts "Skipping all other topics, because this is an incremental crawl.".green
        return
      end
    end
  end
end
|
|
|
|
|
|
|
|
# Crawls every message of the topic at +url+ and records the topic as scraped.
#
# A topic that was scraped before is "skippable": once more than
# ABORT_AFTER_SKIPPED_TOPIC_COUNT consecutive topics were skipped it is not
# fetched at all. Otherwise its messages are crawled to look for new content.
#
# @skipped_topic_count counts consecutive already-scraped topics that had
# nothing new. It is reset to 0 by a topic that was never scraped before or
# by one with new/changed messages. (Previously a brand-new topic incremented
# the counter too, so an incremental crawl gave up after a handful of new
# topics and never reached the remaining ones.)
#
# Errors are reported and swallowed unless @abort_on_error is set.
#
# @param url [String] topic URL
def crawl_topic(url)
  skippable = @scraped_topic_urls.include?(url)

  # Skip this topic if there were already too many consecutive, skipped topics.
  # Otherwise we have to look if there are new messages in the topic.
  if skippable && @skipped_topic_count > ABORT_AFTER_SKIPPED_TOPIC_COUNT
    puts "Skipping".green << " #{url}"
    return
  end

  puts "Scraping #{url}"
  get(url)

  # Pairs of [raw message URL, "link has an empty title" flag]; the flag is
  # passed on to crawl_message as might_be_deleted.
  messages = extract(".subject a[href*='#{@groupname}']") do |a|
    [a["href"].sub("/d/msg/", "/forum/message/raw?msg="), a["title"].empty?]
  end

  messages_crawled = false
  messages.each do |msg_url, might_be_deleted|
    # Deliberately no short-circuit: every message has to be visited.
    messages_crawled = true if crawl_message(msg_url, might_be_deleted)
  end

  @skipped_topic_count = skippable && !messages_crawled ? @skipped_topic_count + 1 : 0
  @scraped_topic_urls << url
rescue
  puts "Failed to scrape topic at #{url}".red
  raise if @abort_on_error
end
|
|
|
|
|
|
|
|
# Downloads the raw message at +url+ and stores it as an .eml file in @path.
#
# The file name is the part of the URL after "<groupname>/" with every "/"
# replaced by "-". For the very first message, the script exits if the sender
# address looks masked ("...@") and --force was not given.
#
# @param url [String] raw message URL
# @param might_be_deleted [Boolean] when true, a missing <pre> element is
#   reported as a probably deleted message instead of a failure
# @return [Boolean] true when the file is new or its content changed, false
#   when it is unchanged or the message could not be scraped
# @raise [StandardError] re-raises scraping errors when @abort_on_error is set
def crawl_message(url, might_be_deleted)
  get(url)

  # Escape the group name so characters like "." are matched literally, and
  # replace every "/" so the id can never point into a subdirectory.
  message_id = url[/#{Regexp.escape(@groupname)}\/(.+)/, 1].tr("/", "-")
  filename = File.join(@path, "#{message_id}.eml")
  content = find("pre")["innerText"]

  unless @first_message_checked
    @first_message_checked = true

    if content.match?(/From:.*\.\.\.@.*/i) && !@force_import
      exit_with_error(<<~TEXT.red.bold)
        It looks like you do not have permissions to see email addresses. Aborting.
        Use the --force option to import anyway.
      TEXT
    end
  end

  # Byte-wise comparison with the previously stored file (if any).
  changed = !File.exist?(filename) || File.binread(filename) != content.b
  File.write(filename, content)
  changed
rescue Selenium::WebDriver::Error::NoSuchElementError
  if might_be_deleted
    puts "Message might be deleted. Skipping #{url}"
  else
    puts "Failed to scrape message at #{url}".red
    raise if @abort_on_error
  end
  false
rescue
  puts "Failed to scrape message at #{url}".red
  raise if @abort_on_error
  false
end
|
|
|
|
|
|
|
|
# Signs in to Google by injecting the cookies from the cookies.txt file.
#
# Steps: open a google.com page (presumably so the browser is on that domain
# before cookies are added - confirm against WebDriver's cookie rules), add
# the cookies for "myaccount.google.com" and "google.com", open the account
# page and wait for the browser URL to start with
# "https://accounts.google.com". If that does not happen before
# wait_for_url times out, the script exits with an error.
def login
  puts "Logging in..."
  get("https://google.com/404")

  add_cookies(
    "myaccount.google.com",
    "google.com"
  )

  get("https://myaccount.google.com/?utm_source=sign_in_no_continue")

  begin
    wait_for_url { |url| url.start_with?("https://accounts.google.com") }
  rescue Selenium::WebDriver::Error::TimeoutError
    exit_with_error("Failed to login. Please check the content of your cookies.txt".red.bold)
  end
end
|
2019-03-24 18:08:03 -04:00
|
|
|
|
2019-09-18 07:07:07 -04:00
|
|
|
# Adds the cookies for the given domains from the @cookies file to the browser.
#
# Each line is split on tabs. Lines that do not have exactly seven fields, or
# whose first field is not one of +domains+ (optionally prefixed with a dot),
# are ignored. The fields are mapped by position: domain, httpOnly, path,
# secure, expiry (epoch seconds, "0" meaning no expiry), name, value.
# NOTE(review): in the Netscape cookies.txt layout the second field looks like
# the "include subdomains" flag rather than httpOnly - confirm the mapping.
#
# @param domains [Array<String>] cookie domains to import
def add_cookies(*domains)
  # Built once; \A and \z anchor the whole field rather than a single line.
  domain_patterns = domains.map { |domain| /\A\.?#{Regexp.escape(domain)}\z/ }

  File.foreach(@cookies) do |line|
    parts = line.chomp.split("\t")
    next unless parts.size == 7
    next unless domain_patterns.any? { |pattern| pattern.match?(parts[0]) }

    driver.manage.add_cookie(
      domain: parts[0],
      httpOnly: "true".casecmp?(parts[1]),
      path: parts[2],
      secure: "true".casecmp?(parts[3]),
      # Base 10 is forced so a leading zero is never read as octal.
      expires: parts[4] == "0" ? nil : Time.at(Integer(parts[4], 10)),
      name: parts[5],
      value: parts[6]
    )
  end
end
|
|
|
|
|
|
|
|
# Waits until the given block returns a truthy value for the browser's
# current URL.
#
# @param timeout [Numeric] timeout handed to Selenium::WebDriver::Wait
#   (defaults to 5, the previously hard-coded value)
# @yieldparam url [String] the driver's current URL
# @return whatever Selenium::WebDriver::Wait#until returns
def wait_for_url(timeout: 5)
  wait = Selenium::WebDriver::Wait.new(timeout: timeout)
  wait.until { yield(driver.current_url) }
end
|
|
|
|
|
2019-03-25 11:10:37 -04:00
|
|
|
# Prints the given messages to standard error, one per line, and terminates
# the script with exit status 1.
#
# Writes to $stderr (not the STDERR constant) so the output follows any
# redirection of the standard error stream.
#
# @param messages [Array<#to_s>] lines to print
def exit_with_error(*messages)
  $stderr.puts(messages)
  exit 1
end
|
|
|
|
|
|
|
|
# Runs a complete crawl: restores the previous status, logs in, crawls all
# topics, stores the status again and prints the elapsed time.
#
# The status file (<@path>/status.yml) holds whether the last crawl finished
# and the set of topic URLs scraped so far. It is rewritten even when the
# crawl is interrupted by an error.
def crawl
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  status_filename = File.join(@path, "status.yml")

  @finished = false
  @scraped_topic_urls = Set.new

  if File.exist?(status_filename)
    # The file is written by this script and contains Symbol keys and a Set.
    # Since Psych 4 a plain YAML.load_file rejects both, so they are permitted
    # explicitly. An empty or truncated file falls back to a fresh status.
    status = YAML.safe_load(
      File.read(status_filename),
      permitted_classes: [Set, Symbol],
      aliases: true
    ) || {}

    @finished = status[:finished] || false
    @scraped_topic_urls = status[:urls] || Set.new
  end

  @skipped_topic_count = 0

  login

  begin
    crawl_topics
    @finished = true
  ensure
    File.write(status_filename, {
      finished: @finished,
      urls: @scraped_topic_urls
    }.to_yaml)
  end

  elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at
  puts "", "", "Done (%02dh %02dmin %02dsec)" % [elapsed / 3600, elapsed / 60 % 60, elapsed % 60]
end
|
|
|
|
|
|
|
|
# Parses the command-line options from ARGV into instance variables and
# creates the output directory.
#
# Sets @groupname, @domain, @cookies, @path, @force_import and
# @abort_on_error. Exits with an error message when an option cannot be
# parsed, when the group name or cookie file is missing, or when the cookie
# file does not exist. Without --path, the output goes to
# DEFAULT_OUTPUT_PATH/<groupname>.
def parse_arguments
  puts ""

  # default values
  @force_import = false
  @abort_on_error = false
  @cookies = DEFAULT_COOKIES_TXT if File.exist?(DEFAULT_COOKIES_TXT)

  parser = OptionParser.new do |opts|
    opts.banner = "Usage: google_groups.rb [options]"

    opts.on("-g", "--groupname GROUPNAME") { |v| @groupname = v }
    opts.on("-d", "--domain DOMAIN") { |v| @domain = v }
    opts.on("-c", "--cookies PATH", "path to cookies.txt") { |v| @cookies = v }
    opts.on("--path PATH", "output path for emails") { |v| @path = v }
    opts.on("-f", "--force", "force import when user isn't allowed to see email addresses") { @force_import = true }
    opts.on("-a", "--abort-on-error", "abort crawl on error instead of skipping message") { @abort_on_error = true }
    opts.on("-h", "--help") do
      puts opts
      exit
    end
  end

  begin
    parser.parse!
  rescue OptionParser::ParseError => e
    exit_with_error(e.message, "", parser)
  end

  mandatory = %i[groupname cookies]
  missing = mandatory.select { |name| instance_variable_get("@#{name}").nil? }

  exit_with_error("Missing arguments: #{missing.join(', ')}".red.bold, "", parser, "") if missing.any?
  exit_with_error("cookies.txt not found at #{@cookies}".red.bold, "") unless File.exist?(@cookies)

  @path ||= File.join(DEFAULT_OUTPUT_PATH, @groupname)
  FileUtils.mkpath(@path)
end
|
|
|
|
|
|
|
|
# Script entry point: read the command-line options, then run the crawl.
parse_arguments
crawl
|