2024-11-28 13:38:23 -05:00
|
|
|
# frozen_string_literal: true
|
|
|
|
|
|
|
|
module DiscourseAi
|
|
|
|
module Sentiment
|
|
|
|
class PostClassification
|
2024-12-03 08:27:03 -05:00
|
|
|
def self.backfill_query(from_post_id: nil, max_age_days: nil)
|
|
|
|
available_classifier_names =
|
2024-12-04 10:10:31 -05:00
|
|
|
DiscourseAi::Sentiment::SentimentSiteSettingJsonSchema.values.map { _1.model_name }
|
2024-12-03 08:27:03 -05:00
|
|
|
|
2024-12-04 10:10:31 -05:00
|
|
|
queries =
|
|
|
|
available_classifier_names.map do |classifier_name|
|
|
|
|
base_query =
|
|
|
|
Post
|
|
|
|
.includes(:sentiment_classifications)
|
|
|
|
.joins("INNER JOIN topics ON topics.id = posts.topic_id")
|
|
|
|
.where(post_type: Post.types[:regular])
|
|
|
|
.where.not(topics: { archetype: Archetype.private_message })
|
|
|
|
.where(posts: { deleted_at: nil })
|
|
|
|
.where(topics: { deleted_at: nil })
|
|
|
|
.joins(<<~SQL)
|
|
|
|
LEFT JOIN classification_results crs
|
|
|
|
ON crs.target_id = posts.id
|
|
|
|
AND crs.target_type = 'Post'
|
|
|
|
AND crs.classification_type = 'sentiment'
|
|
|
|
AND crs.model_used = '#{classifier_name}'
|
|
|
|
SQL
|
|
|
|
.where("crs.id IS NULL")
|
2024-12-03 08:27:03 -05:00
|
|
|
|
2024-12-04 10:10:31 -05:00
|
|
|
base_query =
|
|
|
|
base_query.where("posts.id >= ?", from_post_id.to_i) if from_post_id.present?
|
2024-12-03 08:27:03 -05:00
|
|
|
|
2024-12-04 10:10:31 -05:00
|
|
|
if max_age_days.present?
|
|
|
|
base_query =
|
|
|
|
base_query.where(
|
|
|
|
"posts.created_at > current_date - INTERVAL '#{max_age_days.to_i} DAY'",
|
|
|
|
)
|
|
|
|
end
|
2024-12-03 08:27:03 -05:00
|
|
|
|
2024-12-04 10:10:31 -05:00
|
|
|
base_query
|
|
|
|
end
|
|
|
|
|
|
|
|
unioned_queries = queries.map(&:to_sql).join(" UNION ")
|
|
|
|
|
|
|
|
Post.from(Arel.sql("(#{unioned_queries}) as posts"))
|
2024-12-03 08:27:03 -05:00
|
|
|
end
|
|
|
|
|
2024-11-28 13:38:23 -05:00
|
|
|
def bulk_classify!(relation)
|
|
|
|
http_pool_size = 100
|
|
|
|
pool =
|
|
|
|
Concurrent::CachedThreadPool.new(
|
|
|
|
min_threads: 0,
|
|
|
|
max_threads: http_pool_size,
|
|
|
|
idletime: 30,
|
|
|
|
)
|
|
|
|
|
|
|
|
available_classifiers = classifiers
|
2024-12-03 08:27:03 -05:00
|
|
|
return if available_classifiers.blank?
|
2024-11-28 13:38:23 -05:00
|
|
|
base_url = Discourse.base_url
|
|
|
|
|
|
|
|
promised_classifications =
|
|
|
|
relation
|
|
|
|
.map do |record|
|
|
|
|
text = prepare_text(record)
|
|
|
|
next if text.blank?
|
|
|
|
|
|
|
|
Concurrent::Promises
|
|
|
|
.fulfilled_future({ target: record, text: text }, pool)
|
|
|
|
.then_on(pool) do |w_text|
|
|
|
|
results = Concurrent::Hash.new
|
2024-12-03 08:27:03 -05:00
|
|
|
already_classified = w_text[:target].sentiment_classifications.map(&:model_used)
|
|
|
|
|
|
|
|
classifiers_for_target =
|
|
|
|
available_classifiers.reject { |ac| already_classified.include?(ac.model_name) }
|
2024-11-28 13:38:23 -05:00
|
|
|
|
|
|
|
promised_target_results =
|
2024-12-03 08:27:03 -05:00
|
|
|
classifiers_for_target.map do |c|
|
2024-11-28 13:38:23 -05:00
|
|
|
Concurrent::Promises.future_on(pool) do
|
|
|
|
results[c.model_name] = request_with(w_text[:text], c, base_url)
|
|
|
|
end
|
|
|
|
end
|
|
|
|
|
|
|
|
Concurrent::Promises
|
|
|
|
.zip(*promised_target_results)
|
|
|
|
.then_on(pool) { |_| w_text.merge(classification: results) }
|
|
|
|
end
|
|
|
|
.flat(1)
|
|
|
|
end
|
|
|
|
.compact
|
|
|
|
|
|
|
|
Concurrent::Promises
|
|
|
|
.zip(*promised_classifications)
|
|
|
|
.value!
|
|
|
|
.each { |r| store_classification(r[:target], r[:classification]) }
|
|
|
|
|
|
|
|
pool.shutdown
|
|
|
|
pool.wait_for_termination
|
|
|
|
end
|
|
|
|
|
|
|
|
def classify!(target)
|
|
|
|
return if target.blank?
|
2024-12-03 08:27:03 -05:00
|
|
|
return if classifiers.blank?
|
2024-11-28 13:38:23 -05:00
|
|
|
|
|
|
|
to_classify = prepare_text(target)
|
|
|
|
return if to_classify.blank?
|
|
|
|
|
2024-12-03 08:27:03 -05:00
|
|
|
already_classified = target.sentiment_classifications.map(&:model_used)
|
|
|
|
classifiers_for_target =
|
|
|
|
classifiers.reject { |ac| already_classified.include?(ac.model_name) }
|
|
|
|
|
2024-11-28 13:38:23 -05:00
|
|
|
results =
|
2024-12-03 08:27:03 -05:00
|
|
|
classifiers_for_target.reduce({}) do |memo, model|
|
2024-11-28 13:38:23 -05:00
|
|
|
memo[model.model_name] = request_with(to_classify, model)
|
|
|
|
memo
|
|
|
|
end
|
|
|
|
|
|
|
|
store_classification(target, results)
|
|
|
|
end
|
|
|
|
|
2024-12-02 12:18:03 -05:00
|
|
|
      # Sentiment classifier configurations currently enabled via site settings.
      # Empty when sentiment analysis is not configured.
      def classifiers
        DiscourseAi::Sentiment::SentimentSiteSettingJsonSchema.values
      end
|
|
|
|
|
2024-12-03 08:27:03 -05:00
|
|
|
      # @return [Boolean] whether any sentiment classifier is configured
      # NOTE(review): `has_` prefix is non-idiomatic Ruby, but renaming would
      # break external callers, so it stays.
      def has_classifiers?
        classifiers.present?
      end
|
|
|
|
|
2024-11-28 13:38:23 -05:00
|
|
|
private
|
|
|
|
|
|
|
|
def prepare_text(target)
|
|
|
|
content =
|
|
|
|
if target.post_number == 1
|
|
|
|
"#{target.topic.title}\n#{target.raw}"
|
|
|
|
else
|
|
|
|
target.raw
|
|
|
|
end
|
|
|
|
|
|
|
|
Tokenizer::BertTokenizer.truncate(content, 512)
|
|
|
|
end
|
|
|
|
|
|
|
|
def request_with(content, config, base_url = Discourse.base_url)
|
2024-11-29 15:31:56 -05:00
|
|
|
result =
|
|
|
|
DiscourseAi::Inference::HuggingFaceTextEmbeddings.classify(content, config, base_url)
|
|
|
|
transform_result(result)
|
|
|
|
end
|
|
|
|
|
|
|
|
def transform_result(result)
|
|
|
|
hash_result = {}
|
|
|
|
result.each { |r| hash_result[r[:label]] = r[:score] }
|
|
|
|
hash_result
|
2024-11-28 13:38:23 -05:00
|
|
|
end
|
|
|
|
|
|
|
|
def store_classification(target, classification)
|
|
|
|
attrs =
|
|
|
|
classification.map do |model_name, classifications|
|
|
|
|
{
|
|
|
|
model_used: model_name,
|
|
|
|
target_id: target.id,
|
|
|
|
target_type: target.class.sti_name,
|
|
|
|
classification_type: :sentiment,
|
|
|
|
classification: classifications,
|
|
|
|
updated_at: DateTime.now,
|
|
|
|
created_at: DateTime.now,
|
|
|
|
}
|
|
|
|
end
|
|
|
|
|
|
|
|
ClassificationResult.upsert_all(
|
|
|
|
attrs,
|
|
|
|
unique_by: %i[target_id target_type model_used],
|
|
|
|
update_only: %i[classification],
|
|
|
|
)
|
|
|
|
end
|
|
|
|
end
|
|
|
|
end
|
|
|
|
end
|