discourse/script/import_scripts/mbox/support/indexer.rb

require_relative 'database'
require 'json'
require 'yaml'

module ImportScripts::Mbox
  class Indexer
    # @param database [ImportScripts::Mbox::Database]
    # @param settings [ImportScripts::Mbox::Settings]
    def initialize(database, settings)
      @settings = settings
      @database = database
      # Kept in its own ivar because the file- and line-splitting code reads it directly.
      @split_regex = @settings.split_regex
    end

    # Indexes every sub-directory of the configured data directory as one
    # category, then lets the database link replies, sort the emails and
    # derive the users.
    #
    # Directories are processed in sorted order; plain files directly inside
    # the data directory are not indexed.
    #
    # @return the result of +@database.fill_users_from_emails+
    def execute
      category_dirs = Dir.glob(File.join(@settings.data_dir, '*'))
        .select { |path| File.directory?(path) }
        .sort

      category_dirs.each do |directory|
        puts "indexing files in #{directory}"
        category_name = index_category(directory)[:name]
        index_emails(directory, category_name)
      end

      puts '', 'indexing replies and users'

      # The two modes call different database routines, and in a different
      # relative order (sort-then-link vs. link-then-sort).
      if @settings.group_messages_by_subject
        @database.sort_emails_by_subject
        @database.update_in_reply_to_by_email_subject
      else
        @database.update_in_reply_to_of_emails
        @database.sort_emails_by_date_and_reply_level
      end

      @database.fill_users_from_emails
    end

    private

    # Name of the optional per-directory YAML file that describes a category.
    METADATA_FILENAME = 'metadata.yml'.freeze
    # Files with one of these extensions are never indexed as mail.
    # Frozen so the shared list cannot be modified at runtime.
    IGNORED_FILE_EXTENSIONS = %w[.dbindex .dbnames .digest .subjects .yml].freeze

    # Builds the category for +directory+ and stores it in the database.
    #
    # The name and description come from the directory's metadata file when it
    # exists; the name falls back to the directory's basename.
    #
    # @param directory [String] path of the directory that holds the mail files
    # @return [Hash] the inserted category with the keys +:name+ and +:description+
    def index_category(directory)
      metadata_file = File.join(directory, METADATA_FILENAME)
      metadata = nil

      if File.exist?(metadata_file)
        # workaround for YML files that contain classname in file header
        yaml = File.read(metadata_file).sub(/^--- !.*$/, '---')
        # NOTE(review): YAML.load can instantiate arbitrary objects on older
        # Psych versions; consider YAML.safe_load if metadata files may come
        # from an untrusted source.
        metadata = YAML.load(yaml)
      end

      # An empty file parses to `false` and a scalar-only file to a non-Hash;
      # treat both like a missing metadata file instead of failing on `[]`.
      metadata = {} unless metadata.is_a?(Hash)

      category = {
        name: metadata['name'].presence || File.basename(directory),
        description: metadata['description']
      }

      @database.insert_category(category)
      category
    end

    # Stores every message yielded by +all_messages+ for +directory+.
    #
    # Each email and its reply references are written inside one database
    # transaction. A StandardError raised while handling a message is reported
    # on STDERR (file, line range when known, message, backtrace) and indexing
    # continues with the next message.
    #
    # @param directory [String] directory that contains the mail files
    # @param category_name [String] name of the category the emails belong to
    def index_emails(directory, category_name)
      all_messages(directory, category_name) do |receiver, filename, opts|
        begin
          message_id = receiver.message_id
          mail = receiver.mail
          sender_address, sender_name = receiver.parse_from_field(mail)
          body, elided, format = receiver.select_body
          reply_ids = extract_reply_message_ids(mail)

          record = {
            msg_id: message_id,
            from_email: sender_address,
            from_name: sender_name,
            subject: extract_subject(receiver, category_name),
            email_date: timestamp(mail.date),
            raw_message: receiver.raw_email,
            body: body,
            elided: elided,
            format: format,
            attachment_count: receiver.attachments.count,
            charset: mail.charset&.downcase,
            category: category_name,
            filename: File.basename(filename),
            first_line_number: opts[:first_line_number],
            last_line_number: opts[:last_line_number],
            # evaluated last so the duration covers all of the work above
            index_duration: (monotonic_time - opts[:start_time]).round(4)
          }

          @database.transaction do |db|
            db.insert_email(record)
            db.insert_replies(message_id, reply_ids) unless reply_ids.empty?
          end
        rescue StandardError => error
          first_line = opts[:first_line_number]
          last_line = opts[:last_line_number]

          # Line numbers are only present when the file was split into several messages.
          headline =
            if first_line && last_line
              "Failed to index message in #{filename} at lines #{first_line}-#{last_line}"
            else
              "Failed to index message in #{filename}"
            end

          STDERR.puts headline
          STDERR.puts error.message
          STDERR.puts error.backtrace.inspect
        end
      end
    end

    # Looks up the files already recorded as imported for a category.
    #
    # @param category_name [String]
    # @return [Hash{String => String}] stored checksum keyed by file basename;
    #   when two rows share a basename the later row wins
    def imported_file_checksums(category_name)
      pairs = @database.fetch_imported_files(category_name).map do |row|
        [File.basename(row['filename']), row['checksum']]
      end

      pairs.to_h
    end

    # Walks the entries of +directory+ and yields one receiver per message.
    #
    # Entries rejected by +ignored_file?+ are skipped. When a split regex is
    # configured each file is cut into several messages via +each_mail+;
    # otherwise the whole file is read as a single message. After a file has
    # been handled it is recorded through +mark_as_fully_indexed+.
    #
    # @param directory [String] directory that contains the mail files
    # @param category_name [String]
    # @yieldparam receiver the object built by +read_mail_from_string+ / +read_mail_from_file+
    # @yieldparam filename [String] path of the file (directory joined with the entry name)
    # @yieldparam opts [Hash] always has +:start_time+; split files also have
    #   +:first_line_number+ and +:last_line_number+
    def all_messages(directory, category_name)
      known_checksums = imported_file_checksums(category_name)

      Dir.foreach(directory) do |entry|
        path = File.join(directory, entry)
        next if ignored_file?(path, known_checksums)

        puts "indexing #{path}"

        if @split_regex.present?
          each_mail(path) do |raw_message, first_line, last_line|
            # start_time is taken before parsing so that parsing is part of the measured duration
            details = {
              first_line_number: first_line,
              last_line_number: last_line,
              start_time: monotonic_time
            }
            receiver = read_mail_from_string(raw_message)
            yield receiver, path, details if receiver.present?
          end
        else
          details = { start_time: monotonic_time }
          receiver = read_mail_from_file(path)
          yield receiver, path, details if receiver.present?
        end

        mark_as_fully_indexed(category_name, path)
      end
    end

    # Records a file, together with its current checksum, as imported.
    #
    # @param category_name [String]
    # @param filename [String] path of the file; only its basename is stored
    # @return the result of +@database.insert_imported_file+
    def mark_as_fully_indexed(category_name, filename)
      checksum = calc_checksum(filename)
      basename = File.basename(filename)

      @database.insert_imported_file({ category: category_name, filename: basename, checksum: checksum })
    end

    # Splits a file into messages at every line that matches +@split_regex+.
    #
    # The matching separator lines are not part of the yielded text, but they
    # are counted: the reported line range of a message starts at its
    # separator line (or at line 1 for text before the first separator).
    #
    # @param filename [String] path of the file to split
    # @yieldparam raw_message [String] scrubbed text of one message
    # @yieldparam first_line_number [Integer]
    # @yieldparam last_line_number [Integer]
    def each_mail(filename)
      buffer = +''
      start_line = 1
      lines_seen = 0

      each_line(filename) do |raw_line|
        current = raw_line.scrub
        separator = current =~ @split_regex

        if !separator
          buffer << current
        elsif lines_seen > 0
          # A separator on the very first line has nothing before it to flush.
          yield buffer, start_line, lines_seen
          buffer = +''
          start_line = lines_seen + 1
        end

        lines_seen += 1
      end

      yield buffer, start_line, lines_seen if buffer.present?
    end

    # Yields every line of a file, transparently decompressing files whose
    # name ends with +.gz+.
    #
    # The file handle is closed by the +File.open+ block and the gzip reader is
    # explicitly finished, so neither is left open when the block raises.
    #
    # @param filename [String] path of the (optionally gzipped) file
    # @yieldparam line [String] one line including its line terminator
    def each_line(filename)
      File.open(filename, 'r') do |raw_file|
        if filename.end_with?('.gz')
          gzip = Zlib::GzipReader.new(raw_file)

          begin
            gzip.each_line { |line| yield line }
          ensure
            # `finish` releases the reader without closing raw_file, which
            # the surrounding File.open block takes care of.
            gzip.finish
          end
        else
          raw_file.each_line { |line| yield line }
        end
      end
    end

    # Reads a whole file and hands its content to +read_mail_from_string+.
    #
    # @param filename [String] path of a file that holds a single message
    # @return whatever +read_mail_from_string+ returns for the file's content
    def read_mail_from_file(filename)
      read_mail_from_string(File.read(filename))
    end

    # Wraps a raw message in an +Email::Receiver+.
    #
    # @param raw_message [String, nil] raw text of a single message
    # @return [Email::Receiver, nil] +nil+ when the message is blank
    def read_mail_from_string(raw_message)
      return nil if raw_message.blank?

      Email::Receiver.new(raw_message, convert_plaintext: true, skip_trimming: false)
    end

    # Delegates to +Email::Receiver.extract_reply_message_ids+ with a limit of
    # 20 message ids.
    #
    # @param mail the parsed mail object of a receiver
    # @return the value returned by +Email::Receiver.extract_reply_message_ids+
    def extract_reply_message_ids(mail)
      id_limit = 20
      Email::Receiver.extract_reply_message_ids(mail, max_message_id_count: id_limit)
    end

    # Returns the receiver's subject without the "[list name]" tag and without
    # the prefixes removed by +clean_subject+.
    #
    # @param receiver object responding to +subject+
    # @param list_name [String] name whose bracketed form is removed (case-insensitive)
    # @return [String, nil] +nil+ when the subject is blank
    def extract_subject(receiver, list_name)
      raw_subject = receiver.subject
      return nil if raw_subject.blank?

      # TODO: make the list name (or maybe multiple names) configurable
      # Strip mailing list name from subject
      list_tag = /\[#{Regexp.escape(list_name)}\]/i
      untagged = raw_subject.gsub(list_tag, '').strip

      clean_subject(untagged)
    end

    # TODO: refactor and move prefixes to settings
    # Removes reply and forward prefixes (standard and Italian) from the start
    # of a subject, repeating until nothing more can be removed.
    #
    # Surrounding whitespace is stripped after every pass, so a prefix that
    # only becomes the start of the subject once whitespace is removed
    # (e.g. "Re:\tFwd: x") is stripped as well.
    #
    # @param subject [String]
    # @return [String] the subject without leading prefixes and surrounding whitespace
    def clean_subject(subject)
      prefixes = [
        # Reply prefixes (standard and localized)
        /^Re: */i,
        /^R: */i, # Italian
        /^RIF: */i, # Italian
        # Forward prefixes (standard and localized)
        /^Fwd: */i,
        /^I: */i # Italian
      ]

      # In case of mixed localized prefixes there could be many of them
      # if the mail client didn't strip the localized ones, so keep going
      # for as long as a pass makes the subject shorter.
      loop do
        cleaned = prefixes.reduce(subject) { |text, prefix| text.gsub(prefix, '') }.strip
        return cleaned if cleaned.length >= subject.length

        subject = cleaned
      end
    end

    # Decides whether a directory entry must be skipped during indexing.
    #
    # Skipped are dot-files, the metadata file, files with an ignored
    # extension and files that +fully_indexed?+ reports as already indexed.
    #
    # @param path [String] path of the entry
    # @param checksums [Hash] stored checksums keyed by file basename
    # @return [Boolean]
    def ignored_file?(path, checksums)
      basename = File.basename(path)

      return true if basename.start_with?('.')
      return true if basename == METADATA_FILENAME
      return true if IGNORED_FILE_EXTENSIONS.include?(File.extname(basename))

      fully_indexed?(path, basename, checksums)
    end

    # Tells whether a file was already indexed with its current content.
    #
    # @param path [String] path used to calculate the current checksum
    # @param filename [String] basename used to look up the stored checksum
    # @param checksums [Hash] stored checksums keyed by file basename
    # @return [Boolean] true only when a stored checksum exists and matches;
    #   the file is not read when no checksum is stored
    def fully_indexed?(path, filename, checksums)
      stored = checksums[filename]
      return false unless stored.present?

      calc_checksum(path) == stored
    end

    # Calculates the SHA-256 checksum of a file's content.
    #
    # @param filename [String] path of the file
    # @return [String] hexadecimal digest
    def calc_checksum(filename)
      digest = Digest::SHA256.file(filename)
      digest.hexdigest
    end

    # Reads the monotonic clock; used for measuring how long indexing a
    # message takes.
    #
    # @return [Float] value of +Process::CLOCK_MONOTONIC+
    def monotonic_time
      clock_id = Process::CLOCK_MONOTONIC
      Process.clock_gettime(clock_id)
    end

    # Converts a date into an integer via +Time.zone.at(datetime).to_i+.
    #
    # @param datetime the date of a parsed email, may be +nil+
    # @return [Integer, nil] +nil+ when no date is given
    def timestamp(datetime)
      return nil unless datetime

      Time.zone.at(datetime).to_i
    end
  end
end
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00			`require_relative 'database'`
			`require 'json'`
			`require 'yaml'`

			`module ImportScripts::Mbox`
			`class Indexer`
			`# @param database [ImportScripts::Mbox::Database]`
			`# @param settings [ImportScripts::Mbox::Settings]`
			`def initialize(database, settings)`
			`@database = database`
Improvements to mbox importer * store time it took to index message in DB (to find performance issues) * ignore listserv specific files * better examples for split_regex * first email in mbox shouldn't contain the split string * always lock the DB in exclusive mode * save email within transaction * messages can be grouped by subject and use original order (for Listserv) * adds option to index emails without running the import 2018-01-17 06:03:57 -05:00			`@settings = settings`
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00			`@split_regex = settings.split_regex`
			`end`

			`def execute`
Improvements to mbox importer * store time it took to index message in DB (to find performance issues) * ignore listserv specific files * better examples for split_regex * first email in mbox shouldn't contain the split string * always lock the DB in exclusive mode * save email within transaction * messages can be grouped by subject and use original order (for Listserv) * adds option to index emails without running the import 2018-01-17 06:03:57 -05:00			`directories = Dir.glob(File.join(@settings.data_dir, '*'))`
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00			`directories.select! { \|f\| File.directory?(f) }`
			`directories.sort!`

			`directories.each do \|directory\|`
			`puts "indexing files in #{directory}"`
			`category = index_category(directory)`
			`index_emails(directory, category[:name])`
			`end`

			`puts '', 'indexing replies and users'`
Improvements to mbox importer * store time it took to index message in DB (to find performance issues) * ignore listserv specific files * better examples for split_regex * first email in mbox shouldn't contain the split string * always lock the DB in exclusive mode * save email within transaction * messages can be grouped by subject and use original order (for Listserv) * adds option to index emails without running the import 2018-01-17 06:03:57 -05:00			`if @settings.group_messages_by_subject`
			`@database.sort_emails_by_subject`
			`@database.update_in_reply_to_by_email_subject`
			`else`
			`@database.update_in_reply_to_of_emails`
			`@database.sort_emails_by_date_and_reply_level`
			`end`

Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00			`@database.fill_users_from_emails`
			`end`

			`private`

			`METADATA_FILENAME = 'metadata.yml'.freeze`
Improve Google Groups scraper * Better error detection during login phase * Experimental support for 2FA and SMS codes * Detect missing permissions to scrape email addresses 2019-03-24 18:08:03 -04:00			`IGNORED_FILE_EXTENSIONS = ['.dbindex', '.dbnames', '.digest', '.subjects', '.yml']`
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00
			`def index_category(directory)`
			`metadata_file = File.join(directory, METADATA_FILENAME)`

			`if File.exist?(metadata_file)`
			`# workaround for YML files that contain classname in file header`
			`yaml = File.read(metadata_file).sub(/^--- !.*$/, '---')`
			`metadata = YAML.load(yaml)`
			`else`
			`metadata = {}`
			`end`

			`category = {`
			`name: metadata['name'].presence \|\| File.basename(directory),`
			`description: metadata['description']`
			`}`

			`@database.insert_category(category)`
			`category`
			`end`

			`def index_emails(directory, category_name)`
Improvements to mbox importer * store time it took to index message in DB (to find performance issues) * ignore listserv specific files * better examples for split_regex * first email in mbox shouldn't contain the split string * always lock the DB in exclusive mode * save email within transaction * messages can be grouped by subject and use original order (for Listserv) * adds option to index emails without running the import 2018-01-17 06:03:57 -05:00			`all_messages(directory, category_name) do \|receiver, filename, opts\|`
Improvements to mbox import script * Ignore errors during indexing and show information about the message causing the problem * Always activate imported users if they aren't staged 2018-03-06 05:32:12 -05:00			`begin`
			`msg_id = receiver.message_id`
			`parsed_email = receiver.mail`
			`from_email, from_display_name = receiver.parse_from_field(parsed_email)`
			`body, elided, format = receiver.select_body`
			`reply_message_ids = extract_reply_message_ids(parsed_email)`

			`email = {`
			`msg_id: msg_id,`
			`from_email: from_email,`
			`from_name: from_display_name,`
			`subject: extract_subject(receiver, category_name),`
Improve mbox import script * emails weren't sorted in correct order * better default regex for splitting mbox files * output Message-ID if email is skipped because it doesn't have a Date 2018-08-23 03:46:25 -04:00			`email_date: timestamp(parsed_email.date),`
Improvements to mbox import script * Ignore errors during indexing and show information about the message causing the problem * Always activate imported users if they aren't staged 2018-03-06 05:32:12 -05:00			`raw_message: receiver.raw_email,`
			`body: body,`
			`elided: elided,`
			`format: format,`
			`attachment_count: receiver.attachments.count,`
			`charset: parsed_email.charset&.downcase,`
			`category: category_name,`
			`filename: File.basename(filename),`
			`first_line_number: opts[:first_line_number],`
			`last_line_number: opts[:last_line_number],`
			`index_duration: (monotonic_time - opts[:start_time]).round(4)`
			`}`

			`@database.transaction do \|db\|`
			`db.insert_email(email)`
			`db.insert_replies(msg_id, reply_message_ids) unless reply_message_ids.empty?`
			`end`
			`rescue StandardError => e`
			`if opts[:first_line_number] && opts[:last_line_number]`
			`STDERR.puts "Failed to index message in #{filename} at lines #{opts[:first_line_number]}-#{opts[:last_line_number]}"`
			`else`
			`STDERR.puts "Failed to index message in #{filename}"`
			`end`

			`STDERR.puts e.message`
			`STDERR.puts e.backtrace.inspect`
Improvements to mbox importer * store time it took to index message in DB (to find performance issues) * ignore listserv specific files * better examples for split_regex * first email in mbox shouldn't contain the split string * always lock the DB in exclusive mode * save email within transaction * messages can be grouped by subject and use original order (for Listserv) * adds option to index emails without running the import 2018-01-17 06:03:57 -05:00			`end`
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00			`end`
			`end`

			`def imported_file_checksums(category_name)`
			`rows = @database.fetch_imported_files(category_name)`
			`rows.each_with_object({}) do \|row, hash\|`
FIX: mbox importer didn't detected already indexed files 2018-01-17 11:03:36 -05:00			`filename = File.basename(row['filename'])`
			`hash[filename] = row['checksum']`
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00			`end`
			`end`

			`def all_messages(directory, category_name)`
			`checksums = imported_file_checksums(category_name)`

			`Dir.foreach(directory) do \|filename\|`
			`filename = File.join(directory, filename)`
			`next if ignored_file?(filename, checksums)`

			`puts "indexing #{filename}"`

			`if @split_regex.present?`
			`each_mail(filename) do \|raw_message, first_line_number, last_line_number\|`
Improvements to mbox importer * store time it took to index message in DB (to find performance issues) * ignore listserv specific files * better examples for split_regex * first email in mbox shouldn't contain the split string * always lock the DB in exclusive mode * save email within transaction * messages can be grouped by subject and use original order (for Listserv) * adds option to index emails without running the import 2018-01-17 06:03:57 -05:00			`opts = {`
			`first_line_number: first_line_number,`
			`last_line_number: last_line_number,`
			`start_time: monotonic_time`
			`}`
improvements to the mbox import script * ignores dot-files and empty emails * new setting to prefer HTML over plaintext emails during import * restore original site settings at the end of import * elided content of HTML mails was not put inside details block 2017-11-18 07:53:21 -05:00			`receiver = read_mail_from_string(raw_message)`
Improvements to mbox importer * store time it took to index message in DB (to find performance issues) * ignore listserv specific files * better examples for split_regex * first email in mbox shouldn't contain the split string * always lock the DB in exclusive mode * save email within transaction * messages can be grouped by subject and use original order (for Listserv) * adds option to index emails without running the import 2018-01-17 06:03:57 -05:00			`yield receiver, filename, opts if receiver.present?`
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00			`end`
			`else`
Improvements to mbox importer * store time it took to index message in DB (to find performance issues) * ignore listserv specific files * better examples for split_regex * first email in mbox shouldn't contain the split string * always lock the DB in exclusive mode * save email within transaction * messages can be grouped by subject and use original order (for Listserv) * adds option to index emails without running the import 2018-01-17 06:03:57 -05:00			`opts = { start_time: monotonic_time }`
improvements to the mbox import script * ignores dot-files and empty emails * new setting to prefer HTML over plaintext emails during import * restore original site settings at the end of import * elided content of HTML mails was not put inside details block 2017-11-18 07:53:21 -05:00			`receiver = read_mail_from_file(filename)`
Improvements to mbox importer * store time it took to index message in DB (to find performance issues) * ignore listserv specific files * better examples for split_regex * first email in mbox shouldn't contain the split string * always lock the DB in exclusive mode * save email within transaction * messages can be grouped by subject and use original order (for Listserv) * adds option to index emails without running the import 2018-01-17 06:03:57 -05:00			`yield receiver, filename, opts if receiver.present?`
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00			`end`

			`mark_as_fully_indexed(category_name, filename)`
			`end`
			`end`

			`def mark_as_fully_indexed(category_name, filename)`
			`imported_file = {`
			`category: category_name,`
FIX: mbox importer didn't detected already indexed files 2018-01-17 11:03:36 -05:00			`filename: File.basename(filename),`
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00			`checksum: calc_checksum(filename)`
			`}`

			`@database.insert_imported_file(imported_file)`
			`end`

			`def each_mail(filename)`
			`raw_message = ''`
			`first_line_number = 1`
			`last_line_number = 0`

			`each_line(filename) do \|line\|`
			`line = line.scrub`

Improvements to mbox importer * store time it took to index message in DB (to find performance issues) * ignore listserv specific files * better examples for split_regex * first email in mbox shouldn't contain the split string * always lock the DB in exclusive mode * save email within transaction * messages can be grouped by subject and use original order (for Listserv) * adds option to index emails without running the import 2018-01-17 06:03:57 -05:00			`if line =~ @split_regex`
			`if last_line_number > 0`
			`yield raw_message, first_line_number, last_line_number`
			`raw_message = ''`
			`first_line_number = last_line_number + 1`
			`end`
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00			`else`
			`raw_message << line`
			`end`

			`last_line_number += 1`
			`end`

			`yield raw_message, first_line_number, last_line_number if raw_message.present?`
			`end`

			`def each_line(filename)`
			`raw_file = File.open(filename, 'r')`
			`text_file = filename.end_with?('.gz') ? Zlib::GzipReader.new(raw_file) : raw_file`

			`text_file.each_line do \|line\|`
			`yield line`
			`end`
			`ensure`
			`raw_file.close if raw_file`
			`end`

			`def read_mail_from_file(filename)`
			`raw_message = File.read(filename)`
			`read_mail_from_string(raw_message)`
			`end`

			`def read_mail_from_string(raw_message)`
Improvements to mbox importer * store time it took to index message in DB (to find performance issues) * ignore listserv specific files * better examples for split_regex * first email in mbox shouldn't contain the split string * always lock the DB in exclusive mode * save email within transaction * messages can be grouped by subject and use original order (for Listserv) * adds option to index emails without running the import 2018-01-17 06:03:57 -05:00			`Email::Receiver.new(raw_message, convert_plaintext: true, skip_trimming: false) unless raw_message.blank?`
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00			`end`

			`def extract_reply_message_ids(mail)`
FEATURE: Use Message-ID for detecting email replies to group Ignores the site setting "find_related_post_with_key" and always tries to honor the `In-Reply-To` and `References` header for emails sent to a group. The senders email address must be included in the `To` or `CC` header of a previous email sent to the group and the `Message-ID` of that email must be included in the current email's `In-Reply-To` or `References` header. 2018-03-30 08:37:19 -04:00			`Email::Receiver.extract_reply_message_ids(mail, max_message_id_count: 20)`
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00			`end`

			`def extract_subject(receiver, list_name)`
			`subject = receiver.subject`
			`return nil if subject.blank?`

			`# TODO: make the list name (or maybe multiple names) configurable`
			`# Strip mailing list name from subject`
Ignore case when removing mailing list name from subject 2018-02-12 15:41:58 -05:00			`subject = subject.gsub(/\[#{Regexp.escape(list_name)}\]/i, '').strip`
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00
			`clean_subject(subject)`
			`end`

			`# TODO: refactor and move prefixes to settings`
			`def clean_subject(subject)`
			`original_length = subject.length`

			`# Strip Reply prefix from title (Standard and localized)`
			`subject = subject.gsub(/^Re: */i, '')`
			`subject = subject.gsub(/^R: */i, '') #Italian`
			`subject = subject.gsub(/^RIF: */i, '') #Italian`

			`# Strip Forward prefix from title (Standard and localized)`
			`subject = subject.gsub(/^Fwd: */i, '')`
			`subject = subject.gsub(/^I: */i, '') #Italian`

			`subject.strip`

			`# In case of mixed localized prefixes there could be many of them`
			`# if the mail client didn't strip the localized ones`
			`if original_length > subject.length`
			`clean_subject(subject)`
			`else`
			`subject`
			`end`
			`end`

FIX: mbox importer didn't detected already indexed files 2018-01-17 11:03:36 -05:00			`def ignored_file?(path, checksums)`
			`filename = File.basename(path)`
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00
Improvements to mbox importer * store time it took to index message in DB (to find performance issues) * ignore listserv specific files * better examples for split_regex * first email in mbox shouldn't contain the split string * always lock the DB in exclusive mode * save email within transaction * messages can be grouped by subject and use original order (for Listserv) * adds option to index emails without running the import 2018-01-17 06:03:57 -05:00			`filename.start_with?('.') \|\|`
			`filename == METADATA_FILENAME \|\|`
			`IGNORED_FILE_EXTENSIONS.include?(File.extname(filename)) \|\|`
FIX: mbox importer didn't detected already indexed files 2018-01-17 11:03:36 -05:00			`fully_indexed?(path, filename, checksums)`
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00			`end`

FIX: mbox importer didn't detected already indexed files 2018-01-17 11:03:36 -05:00			`def fully_indexed?(path, filename, checksums)`
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00			`checksum = checksums[filename]`
FIX: mbox importer didn't detected already indexed files 2018-01-17 11:03:36 -05:00			`checksum.present? && calc_checksum(path) == checksum`
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00			`end`

			`def calc_checksum(filename)`
			`Digest::SHA256.file(filename).hexdigest`
			`end`
Improvements to mbox importer * store time it took to index message in DB (to find performance issues) * ignore listserv specific files * better examples for split_regex * first email in mbox shouldn't contain the split string * always lock the DB in exclusive mode * save email within transaction * messages can be grouped by subject and use original order (for Listserv) * adds option to index emails without running the import 2018-01-17 06:03:57 -05:00
			`def monotonic_time`
			`Process.clock_gettime(Process::CLOCK_MONOTONIC)`
			`end`
Improve mbox import script * emails weren't sorted in correct order * better default regex for splitting mbox files * output Message-ID if email is skipped because it doesn't have a Date 2018-08-23 03:46:25 -04:00
			`def timestamp(datetime)`
			`Time.zone.at(datetime).to_i if datetime`
			`end`
Add new, experimental version of mbox importer 2017-05-26 16:26:18 -04:00			`end`
			`end`