discourse/script/import_scripts/mbox/support/indexer.rb

# frozen_string_literal: true

require_relative 'database'
require 'json'
require 'yaml'

module ImportScripts::Mbox
  class Indexer
    # Creates an indexer that stores everything it finds through +database+ and is
    # configured by +settings+.
    #
    # @param database [ImportScripts::Mbox::Database] target of all inserts and updates
    # @param settings [ImportScripts::Mbox::Settings] provides the data directory, the
    #   split regex and the subject-grouping flag used by this class
    def initialize(database, settings)
      @database, @settings = database, settings
      # Kept in its own variable because it is consulted for every file and every line.
      @split_regex = @settings.split_regex
    end

    # Runs the complete indexing pass.
    #
    # Every directory directly below the configured data directory is handled as one
    # category, in sorted order. Afterwards the reply relations are calculated (either by
    # subject or by message references, depending on the settings) and the users are
    # derived from the indexed emails.
    def execute
      category_dirs = Dir.glob(File.join(@settings.data_dir, '*'))
        .select { |path| File.directory?(path) }
        .sort

      category_dirs.each do |category_dir|
        puts "indexing files in #{category_dir}"
        category_name = index_category(category_dir)[:name]
        index_emails(category_dir, category_name)
      end

      puts '', 'indexing replies and users'

      if @settings.group_messages_by_subject
        @database.sort_emails_by_subject
        @database.update_in_reply_to_by_email_subject
      else
        @database.update_in_reply_to_of_emails
        @database.sort_emails_by_date_and_reply_level
      end

      @database.fill_users_from_emails
    end

    private

    # Name of the optional YAML file inside a category directory that holds the
    # category's name and description.
    METADATA_FILENAME = 'metadata.yml'
    # Extensions of files that are skipped when looking for mail files. Frozen so the
    # shared list cannot be modified by accident at runtime.
    IGNORED_FILE_EXTENSIONS = ['.dbindex', '.dbnames', '.digest', '.subjects', '.yml'].freeze

    # Registers the category that belongs to a data directory.
    #
    # Name and description are read from the directory's optional metadata file. The
    # directory's basename is used as name when that file is missing, empty, not a YAML
    # mapping, or has a blank +name+.
    #
    # @param directory [String] path of the category directory
    # @return [Hash] the category that was handed to the database (+:name+, +:description+)
    def index_category(directory)
      metadata_file = File.join(directory, METADATA_FILENAME)
      metadata = {}

      if File.exist?(metadata_file)
        # workaround for YML files that contain classname in file header
        yaml = File.read(metadata_file).sub(/^--- !.*$/, '---')
        # NOTE(review): depending on the Psych version YAML.load can deserialize arbitrary
        # objects; consider YAML.safe_load if metadata files may come from untrusted sources.
        loaded = YAML.load(yaml)
        # An empty file loads as false/nil and a scalar or list document as a non-Hash,
        # which would break the key lookups below, so only a mapping counts as metadata.
        metadata = loaded if loaded.is_a?(Hash)
      end

      category = {
        name: metadata['name'].presence || File.basename(directory),
        description: metadata['description']
      }

      @database.insert_category(category)
      category
    end

    # Indexes all messages found in the files of a category directory.
    #
    # Every message yielded by #all_messages is turned into one row for the database and
    # stored together with the message IDs it replies to. A failure while handling a
    # single message is reported on STDERR and does not stop the remaining messages.
    #
    # @param directory [String] path of the category directory
    # @param category_name [String] name of the category the emails are stored under
    def index_emails(directory, category_name)
      all_messages(directory, category_name) do |receiver, filename, opts|
        begin
          message_id = receiver.message_id
          mail = receiver.mail
          sender_email, sender_name = receiver.parse_from_field(mail)
          body, elided, body_format = receiver.select_body
          reply_ids = extract_reply_message_ids(mail)

          row = {
            msg_id: message_id,
            from_email: sender_email,
            from_name: sender_name,
            subject: extract_subject(receiver, category_name),
            email_date: timestamp(mail.date),
            raw_message: receiver.raw_email,
            body: body,
            elided: elided,
            format: body_format,
            attachment_count: receiver.attachments.count,
            charset: mail.charset&.downcase,
            category: category_name,
            filename: File.basename(filename),
            first_line_number: opts[:first_line_number],
            last_line_number: opts[:last_line_number],
            # taken last so the duration includes all of the parsing above
            index_duration: (monotonic_time - opts[:start_time]).round(4)
          }

          # the email and its reply references are written inside one transaction
          @database.transaction do |db|
            db.insert_email(row)
            db.insert_replies(message_id, reply_ids) unless reply_ids.empty?
          end
        rescue StandardError => error
          # line numbers are only known for messages that were split out of a mailbox file
          first_line, last_line = opts.values_at(:first_line_number, :last_line_number)
          location = first_line && last_line ? " at lines #{first_line}-#{last_line}" : ''

          STDERR.puts "Failed to index message in #{filename}#{location}"
          STDERR.puts error.message
          STDERR.puts error.backtrace.inspect
        end
      end
    end

    # Looks up the files of a category that were already recorded as imported.
    #
    # @param category_name [String] category whose imported files are fetched
    # @return [Hash{String => String}] stored checksum keyed by the file's basename
    def imported_file_checksums(category_name)
      pairs = @database.fetch_imported_files(category_name).map do |record|
        [File.basename(record['filename']), record['checksum']]
      end

      pairs.to_h
    end

    # Yields every message stored in the files of a category directory.
    #
    # Entries that are not regular files (such as nested directories) and files rejected by
    # #ignored_file? are skipped. When a split regex is configured each file is treated as
    # a mailbox that may hold many messages, otherwise as exactly one message. Blank
    # messages are not yielded. After a file has been processed it is recorded as fully
    # indexed.
    #
    # @param directory [String] path of the category directory
    # @param category_name [String] category used to look up and record indexed files
    # @yieldparam receiver [Email::Receiver] receiver created for the message
    # @yieldparam filename [String] path of the file that contains the message
    # @yieldparam opts [Hash] +:start_time+ and, for split mailboxes, +:first_line_number+
    #   and +:last_line_number+
    def all_messages(directory, category_name)
      checksums = imported_file_checksums(category_name)

      Dir.foreach(directory) do |entry|
        filename = File.join(directory, entry)
        # Dir.foreach also lists sub-directories; reading or checksumming one of those as
        # a mail file would raise (e.g. Errno::EISDIR) and abort the whole run.
        next unless File.file?(filename)
        next if ignored_file?(filename, checksums)

        puts "indexing #{filename}"

        if @split_regex.present?
          each_mail(filename) do |raw_message, first_line_number, last_line_number|
            opts = {
              first_line_number: first_line_number,
              last_line_number: last_line_number,
              start_time: monotonic_time
            }
            receiver = read_mail_from_string(raw_message)
            yield receiver, filename, opts if receiver.present?
          end
        else
          opts = { start_time: monotonic_time }
          receiver = read_mail_from_file(filename)
          yield receiver, filename, opts if receiver.present?
        end

        mark_as_fully_indexed(category_name, filename)
      end
    end

    # Records a file together with its current checksum as imported for a category.
    #
    # @param category_name [String] category the file belongs to
    # @param filename [String] path of the file; only its basename is stored
    def mark_as_fully_indexed(category_name, filename)
      basename = File.basename(filename)
      checksum = calc_checksum(filename)

      @database.insert_imported_file({ category: category_name, filename: basename, checksum: checksum })
    end

    # Splits a mailbox file into messages and yields them one by one.
    #
    # Lines matching the split regex separate messages and are not part of any message.
    # A separator on the very first line yields nothing. The text collected after the last
    # separator is only yielded when it is not blank.
    #
    # @param filename [String] path of the mailbox file
    # @yieldparam raw_message [String] scrubbed text of one message
    # @yieldparam first_line_number [Integer] 1 for the first message, afterwards the line
    #   number of the separator that precedes the message
    # @yieldparam last_line_number [Integer] number of the last line that belongs to the message
    def each_mail(filename)
      buffer = +''
      start_line = 1
      lines_seen = 0

      each_line(filename) do |raw_line|
        text = raw_line.scrub
        separator = text =~ @split_regex

        if !separator
          buffer << text
        elsif lines_seen > 0
          yield buffer, start_line, lines_seen
          buffer = +''
          start_line = lines_seen + 1
        end

        lines_seen += 1
      end

      yield buffer, start_line, lines_seen if buffer.present?
    end

    # Yields each line of a file, transparently decompressing files ending in +.gz+.
    #
    # The block forms are used so that the file - and for compressed files the gzip
    # reader wrapped around it - is closed as soon as iteration ends, even when the
    # block raises.
    #
    # @param filename [String] path of the plain or gzip-compressed file
    # @yieldparam line [String] next line of the (decompressed) file
    def each_line(filename, &block)
      if filename.end_with?('.gz')
        Zlib::GzipReader.open(filename) { |gzip| gzip.each_line(&block) }
      else
        File.foreach(filename, &block)
      end
    end

    # Reads a whole file and treats its content as a single raw email.
    #
    # @param filename [String] path of the file holding one message
    # @return [Email::Receiver, nil] see #read_mail_from_string
    def read_mail_from_file(filename)
      read_mail_from_string(File.read(filename))
    end

    # Wraps a raw email in a receiver that can parse it.
    #
    # @param raw_message [String, nil] complete source of one email
    # @return [Email::Receiver, nil] +nil+ when the message is blank
    def read_mail_from_string(raw_message)
      return if raw_message.blank?

      Email::Receiver.new(raw_message, convert_plaintext: true, skip_trimming: false)
    end

    # Collects the message IDs the given mail replies to, limited to 20 IDs.
    #
    # @param mail [Object] parsed mail as returned by the receiver's +mail+ method
    # @return [Object] whatever Email::Receiver.extract_reply_message_ids returns;
    #   callers in this class treat it as a list of message IDs
    def extract_reply_message_ids(mail)
      id_limit = 20
      Email::Receiver.extract_reply_message_ids(mail, max_message_id_count: id_limit)
    end

    # Returns the receiver's subject without surrounding whitespace.
    #
    # @param receiver [Email::Receiver] receiver of the message
    # @param list_name [String] currently unused; kept so existing callers keep working
    # @return [String, nil] the stripped subject, or +nil+ when the subject is blank
    def extract_subject(receiver, list_name)
      subject = receiver.subject
      subject.strip unless subject.blank?
    end

    # Decides whether a file must be skipped during indexing.
    #
    # A file is skipped when it is a dot-file, the metadata file, has one of the ignored
    # extensions, or was already indexed with an unchanged checksum.
    #
    # @param path [String] path of the file
    # @param checksums [Hash{String => String}] stored checksums keyed by basename
    # @return [Boolean]
    def ignored_file?(path, checksums)
      filename = File.basename(path)

      return true if filename.start_with?('.')
      return true if filename == METADATA_FILENAME
      return true if IGNORED_FILE_EXTENSIONS.include?(File.extname(filename))

      fully_indexed?(path, filename, checksums)
    end

    # Checks whether a file was already indexed and has not changed since.
    #
    # @param path [String] path of the file, used to calculate its current checksum
    # @param filename [String] basename used to look up the stored checksum
    # @param checksums [Hash{String => String}] stored checksums keyed by basename
    # @return [Boolean] +true+ only when a stored checksum exists and still matches
    def fully_indexed?(path, filename, checksums)
      stored_checksum = checksums[filename]
      # without a stored checksum the file is never hashed
      return false unless stored_checksum.present?

      calc_checksum(path) == stored_checksum
    end

    # Calculates the SHA-256 checksum of a file's content.
    #
    # @param filename [String] path of the file
    # @return [String] checksum as hexadecimal string
    def calc_checksum(filename)
      digest = Digest::SHA256.new
      digest.file(filename)
      digest.hexdigest
    end

    # Reads the monotonic clock, which is used to measure how long indexing a message takes.
    #
    # @return [Float] clock value in seconds
    def monotonic_time
      Process.clock_gettime(Process::CLOCK_MONOTONIC, :float_second)
    end

    # Converts a date into an integer timestamp using the configured time zone.
    #
    # @param datetime [Object, nil] date of the email as returned by the parsed mail
    # @return [Integer, nil] result of +to_i+ on the converted time, +nil+ without a date
    def timestamp(datetime)
      return unless datetime

      Time.zone.at(datetime).to_i
    end
  end
end
DEV: enable frozen string literal on all files This reduces chances of errors where consumers of strings mutate inputs and reduces memory usage of the app. Test suite passes now, but there may be some stuff left, so we will run a few sites on a branch prior to merging 2019-05-02 18:17:27 -04:00			`# frozen_string_literal: true`

Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00			`require_relative 'database'`
			`require 'json'`
			`require 'yaml'`

			`module ImportScripts::Mbox`
			`class Indexer`
			`# @param database [ImportScripts::Mbox::Database]`
			`# @param settings [ImportScripts::Mbox::Settings]`
			`def initialize(database, settings)`
			`@database = database`
Improvements to mbox importer * store time it took to index message in DB (to find performance issues) * ignore listserv specific files * better examples for split_regex * first email in mbox shouldn't contain the split string * always lock the DB in exclusive mode * save email within transaction * messages can be grouped by subject and use original order (for Listserv) * adds option to index emails without running the import 2018-01-17 06:03:57 -05:00			`@settings = settings`
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00			`@split_regex = settings.split_regex`
			`end`

			`def execute`
Improvements to mbox importer * store time it took to index message in DB (to find performance issues) * ignore listserv specific files * better examples for split_regex * first email in mbox shouldn't contain the split string * always lock the DB in exclusive mode * save email within transaction * messages can be grouped by subject and use original order (for Listserv) * adds option to index emails without running the import 2018-01-17 06:03:57 -05:00			`directories = Dir.glob(File.join(@settings.data_dir, '*'))`
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00			`directories.select! { \|f\| File.directory?(f) }`
			`directories.sort!`

			`directories.each do \|directory\|`
			`puts "indexing files in #{directory}"`
			`category = index_category(directory)`
			`index_emails(directory, category[:name])`
			`end`

			`puts '', 'indexing replies and users'`
Improvements to mbox importer * store time it took to index message in DB (to find performance issues) * ignore listserv specific files * better examples for split_regex * first email in mbox shouldn't contain the split string * always lock the DB in exclusive mode * save email within transaction * messages can be grouped by subject and use original order (for Listserv) * adds option to index emails without running the import 2018-01-17 06:03:57 -05:00			`if @settings.group_messages_by_subject`
			`@database.sort_emails_by_subject`
			`@database.update_in_reply_to_by_email_subject`
			`else`
			`@database.update_in_reply_to_of_emails`
			`@database.sort_emails_by_date_and_reply_level`
			`end`

Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00			`@database.fill_users_from_emails`
			`end`

			`private`

DEV: stop freezing frozen strings We have the `# frozen_string_literal: true` comment on all our files. This means all string literals are frozen. There is no need to call #freeze on any literals. For files with `# frozen_string_literal: true` ``` puts %w{a b}[0].frozen? => true puts "hi".frozen? => true puts "a #{1} b".frozen? => true puts ("a " + "b").frozen? => false puts (-("a " + "b")).frozen? => true ``` For more details see: https://samsaffron.com/archive/2018/02/16/reducing-string-duplication-in-ruby 2020-04-30 02:48:34 -04:00			`METADATA_FILENAME = 'metadata.yml'`
Improve Google Groups scraper * Better error detection during login phase * Experimental support for 2FA and SMS codes * Detect missing permissions to scrape email addresses 2019-03-24 18:08:03 -04:00			`IGNORED_FILE_EXTENSIONS = ['.dbindex', '.dbnames', '.digest', '.subjects', '.yml']`
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00
			`def index_category(directory)`
			`metadata_file = File.join(directory, METADATA_FILENAME)`

			`if File.exist?(metadata_file)`
			`# workaround for YML files that contain classname in file header`
			`yaml = File.read(metadata_file).sub(/^--- !.*$/, '---')`
			`metadata = YAML.load(yaml)`
			`else`
			`metadata = {}`
			`end`

			`category = {`
			`name: metadata['name'].presence \|\| File.basename(directory),`
			`description: metadata['description']`
			`}`

			`@database.insert_category(category)`
			`category`
			`end`

			`def index_emails(directory, category_name)`
Improvements to mbox importer * store time it took to index message in DB (to find performance issues) * ignore listserv specific files * better examples for split_regex * first email in mbox shouldn't contain the split string * always lock the DB in exclusive mode * save email within transaction * messages can be grouped by subject and use original order (for Listserv) * adds option to index emails without running the import 2018-01-17 06:03:57 -05:00			`all_messages(directory, category_name) do \|receiver, filename, opts\|`
Improvements to mbox import script * Ignore errors during indexing and show information about the message causing the problem * Always activate imported users if they aren't staged 2018-03-06 05:32:12 -05:00			`begin`
			`msg_id = receiver.message_id`
			`parsed_email = receiver.mail`
			`from_email, from_display_name = receiver.parse_from_field(parsed_email)`
			`body, elided, format = receiver.select_body`
			`reply_message_ids = extract_reply_message_ids(parsed_email)`

			`email = {`
			`msg_id: msg_id,`
			`from_email: from_email,`
			`from_name: from_display_name,`
			`subject: extract_subject(receiver, category_name),`
Improve mbox import script * emails weren't sorted in correct order * better default regex for splitting mbox files * output Message-ID if email is skipped because it doesn't have a Date 2018-08-23 03:46:25 -04:00			`email_date: timestamp(parsed_email.date),`
Improvements to mbox import script * Ignore errors during indexing and show information about the message causing the problem * Always activate imported users if they aren't staged 2018-03-06 05:32:12 -05:00			`raw_message: receiver.raw_email,`
			`body: body,`
			`elided: elided,`
			`format: format,`
			`attachment_count: receiver.attachments.count,`
			`charset: parsed_email.charset&.downcase,`
			`category: category_name,`
			`filename: File.basename(filename),`
			`first_line_number: opts[:first_line_number],`
			`last_line_number: opts[:last_line_number],`
			`index_duration: (monotonic_time - opts[:start_time]).round(4)`
			`}`

			`@database.transaction do \|db\|`
			`db.insert_email(email)`
			`db.insert_replies(msg_id, reply_message_ids) unless reply_message_ids.empty?`
			`end`
			`rescue StandardError => e`
			`if opts[:first_line_number] && opts[:last_line_number]`
			`STDERR.puts "Failed to index message in #{filename} at lines #{opts[:first_line_number]}-#{opts[:last_line_number]}"`
			`else`
			`STDERR.puts "Failed to index message in #{filename}"`
			`end`

			`STDERR.puts e.message`
			`STDERR.puts e.backtrace.inspect`
Improvements to mbox importer * store time it took to index message in DB (to find performance issues) * ignore listserv specific files * better examples for split_regex * first email in mbox shouldn't contain the split string * always lock the DB in exclusive mode * save email within transaction * messages can be grouped by subject and use original order (for Listserv) * adds option to index emails without running the import 2018-01-17 06:03:57 -05:00			`end`
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00			`end`
			`end`

			`def imported_file_checksums(category_name)`
			`rows = @database.fetch_imported_files(category_name)`
			`rows.each_with_object({}) do \|row, hash\|`
FIX: mbox importer didn't detected already indexed files 2018-01-17 11:03:36 -05:00			`filename = File.basename(row['filename'])`
			`hash[filename] = row['checksum']`
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00			`end`
			`end`

			`def all_messages(directory, category_name)`
			`checksums = imported_file_checksums(category_name)`

			`Dir.foreach(directory) do \|filename\|`
			`filename = File.join(directory, filename)`
			`next if ignored_file?(filename, checksums)`

			`puts "indexing #{filename}"`

			`if @split_regex.present?`
			`each_mail(filename) do \|raw_message, first_line_number, last_line_number\|`
Improvements to mbox importer * store time it took to index message in DB (to find performance issues) * ignore listserv specific files * better examples for split_regex * first email in mbox shouldn't contain the split string * always lock the DB in exclusive mode * save email within transaction * messages can be grouped by subject and use original order (for Listserv) * adds option to index emails without running the import 2018-01-17 06:03:57 -05:00			`opts = {`
			`first_line_number: first_line_number,`
			`last_line_number: last_line_number,`
			`start_time: monotonic_time`
			`}`
improvements to the mbox import script * ignores dot-files and empty emails * new setting to prefer HTML over plaintext emails during import * restore original site settings at the end of import * elided content of HTML mails was not put inside details block 2017-11-18 07:53:21 -05:00			`receiver = read_mail_from_string(raw_message)`
Improvements to mbox importer * store time it took to index message in DB (to find performance issues) * ignore listserv specific files * better examples for split_regex * first email in mbox shouldn't contain the split string * always lock the DB in exclusive mode * save email within transaction * messages can be grouped by subject and use original order (for Listserv) * adds option to index emails without running the import 2018-01-17 06:03:57 -05:00			`yield receiver, filename, opts if receiver.present?`
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00			`end`
			`else`
Improvements to mbox importer * store time it took to index message in DB (to find performance issues) * ignore listserv specific files * better examples for split_regex * first email in mbox shouldn't contain the split string * always lock the DB in exclusive mode * save email within transaction * messages can be grouped by subject and use original order (for Listserv) * adds option to index emails without running the import 2018-01-17 06:03:57 -05:00			`opts = { start_time: monotonic_time }`
improvements to the mbox import script * ignores dot-files and empty emails * new setting to prefer HTML over plaintext emails during import * restore original site settings at the end of import * elided content of HTML mails was not put inside details block 2017-11-18 07:53:21 -05:00			`receiver = read_mail_from_file(filename)`
Improvements to mbox importer * store time it took to index message in DB (to find performance issues) * ignore listserv specific files * better examples for split_regex * first email in mbox shouldn't contain the split string * always lock the DB in exclusive mode * save email within transaction * messages can be grouped by subject and use original order (for Listserv) * adds option to index emails without running the import 2018-01-17 06:03:57 -05:00			`yield receiver, filename, opts if receiver.present?`
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00			`end`

			`mark_as_fully_indexed(category_name, filename)`
			`end`
			`end`

			`def mark_as_fully_indexed(category_name, filename)`
			`imported_file = {`
			`category: category_name,`
FIX: mbox importer didn't detected already indexed files 2018-01-17 11:03:36 -05:00			`filename: File.basename(filename),`
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00			`checksum: calc_checksum(filename)`
			`}`

			`@database.insert_imported_file(imported_file)`
			`end`

			`def each_mail(filename)`
Make import scripts work with frozen strings 2019-05-30 16:20:57 -04:00			`raw_message = +''`
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00			`first_line_number = 1`
			`last_line_number = 0`

			`each_line(filename) do \|line\|`
			`line = line.scrub`

Improvements to mbox importer * store time it took to index message in DB (to find performance issues) * ignore listserv specific files * better examples for split_regex * first email in mbox shouldn't contain the split string * always lock the DB in exclusive mode * save email within transaction * messages can be grouped by subject and use original order (for Listserv) * adds option to index emails without running the import 2018-01-17 06:03:57 -05:00			`if line =~ @split_regex`
			`if last_line_number > 0`
			`yield raw_message, first_line_number, last_line_number`
Make import scripts work with frozen strings 2019-05-30 16:20:57 -04:00			`raw_message = +''`
Improvements to mbox importer * store time it took to index message in DB (to find performance issues) * ignore listserv specific files * better examples for split_regex * first email in mbox shouldn't contain the split string * always lock the DB in exclusive mode * save email within transaction * messages can be grouped by subject and use original order (for Listserv) * adds option to index emails without running the import 2018-01-17 06:03:57 -05:00			`first_line_number = last_line_number + 1`
			`end`
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00			`else`
			`raw_message << line`
			`end`

			`last_line_number += 1`
			`end`

			`yield raw_message, first_line_number, last_line_number if raw_message.present?`
			`end`

			`def each_line(filename)`
			`raw_file = File.open(filename, 'r')`
			`text_file = filename.end_with?('.gz') ? Zlib::GzipReader.new(raw_file) : raw_file`

			`text_file.each_line do \|line\|`
			`yield line`
			`end`
			`ensure`
			`raw_file.close if raw_file`
			`end`

			`def read_mail_from_file(filename)`
			`raw_message = File.read(filename)`
			`read_mail_from_string(raw_message)`
			`end`

			`def read_mail_from_string(raw_message)`
Improvements to mbox importer * store time it took to index message in DB (to find performance issues) * ignore listserv specific files * better examples for split_regex * first email in mbox shouldn't contain the split string * always lock the DB in exclusive mode * save email within transaction * messages can be grouped by subject and use original order (for Listserv) * adds option to index emails without running the import 2018-01-17 06:03:57 -05:00			`Email::Receiver.new(raw_message, convert_plaintext: true, skip_trimming: false) unless raw_message.blank?`
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00			`end`

			`def extract_reply_message_ids(mail)`
FEATURE: Use Message-ID for detecting email replies to group Ignores the site setting "find_related_post_with_key" and always tries to honor the `In-Reply-To` and `References` header for emails sent to a group. The senders email address must be included in the `To` or `CC` header of a previous email sent to the group and the `Message-ID` of that email must be included in the current email's `In-Reply-To` or `References` header. 2018-03-30 08:37:19 -04:00			`Email::Receiver.extract_reply_message_ids(mail, max_message_id_count: 20)`
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00			`end`

			`def extract_subject(receiver, list_name)`
			`subject = receiver.subject`
DEV: Improve mbox import script * Customizable email subject prefixes to remove "Re" and "Fwd" as well as localized prefixes. * Configuration option for prefixes like [FOO] or (BAR) which can be replaced with tags during import. * Bugfix: Import script might have skipped some users due to missing ORDER BY. 2020-03-07 19:20:02 -05:00			`subject.blank? ? nil : subject.strip`
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00			`end`

FIX: mbox importer didn't detected already indexed files 2018-01-17 11:03:36 -05:00			`def ignored_file?(path, checksums)`
			`filename = File.basename(path)`
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00
Improvements to mbox importer * store time it took to index message in DB (to find performance issues) * ignore listserv specific files * better examples for split_regex * first email in mbox shouldn't contain the split string * always lock the DB in exclusive mode * save email within transaction * messages can be grouped by subject and use original order (for Listserv) * adds option to index emails without running the import 2018-01-17 06:03:57 -05:00			`filename.start_with?('.') \|\|`
			`filename == METADATA_FILENAME \|\|`
			`IGNORED_FILE_EXTENSIONS.include?(File.extname(filename)) \|\|`
FIX: mbox importer didn't detected already indexed files 2018-01-17 11:03:36 -05:00			`fully_indexed?(path, filename, checksums)`
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00			`end`

FIX: mbox importer didn't detected already indexed files 2018-01-17 11:03:36 -05:00			`def fully_indexed?(path, filename, checksums)`
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00			`checksum = checksums[filename]`
FIX: mbox importer didn't detected already indexed files 2018-01-17 11:03:36 -05:00			`checksum.present? && calc_checksum(path) == checksum`
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00			`end`

			`def calc_checksum(filename)`
			`Digest::SHA256.file(filename).hexdigest`
			`end`
Improvements to mbox importer * store time it took to index message in DB (to find performance issues) * ignore listserv specific files * better examples for split_regex * first email in mbox shouldn't contain the split string * always lock the DB in exclusive mode * save email within transaction * messages can be grouped by subject and use original order (for Listserv) * adds option to index emails without running the import 2018-01-17 06:03:57 -05:00
			`def monotonic_time`
			`Process.clock_gettime(Process::CLOCK_MONOTONIC)`
			`end`
Improve mbox import script * emails weren't sorted in correct order * better default regex for splitting mbox files * output Message-ID if email is skipped because it doesn't have a Date 2018-08-23 03:46:25 -04:00
			`def timestamp(datetime)`
			`Time.zone.at(datetime).to_i if datetime`
			`end`
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00			`end`
			`end`