# discourse-ai/app/models/rag_document_fragment.rb

# frozen_string_literal: true

# A fragment of an uploaded document, attached to a polymorphic +target+
# record. The class-level helpers below keep the fragments and upload
# references of a target in sync with a list of upload ids, report how many
# fragments have a row in the embeddings table, and publish status updates
# over MessageBus.
class RagDocumentFragment < ActiveRecord::Base
  # TODO Jan 2025 - remove
  self.ignored_columns = %i[ai_persona_id]

  belongs_to :upload
  belongs_to :target, polymorphic: true

  class << self
    # Ensures upload references exist between +target+ and +upload_ids+, then
    # enqueues one +digest_rag_upload+ job per upload id.
    #
    # Does nothing when +target+ or +upload_ids+ is blank, or when the
    # +ai_embeddings_enabled+ site setting is off.
    #
    # @param target [ActiveRecord::Base] record the uploads belong to
    # @param upload_ids [Array<Integer>] ids of the uploads to link
    def link_target_and_uploads(target, upload_ids)
      return if target.blank?
      return if upload_ids.blank?
      return if !SiteSetting.ai_embeddings_enabled?

      UploadReference.ensure_exist!(upload_ids: upload_ids, target: target)

      upload_ids.each do |upload_id|
        Jobs.enqueue(
          :digest_rag_upload,
          target_id: target.id,
          target_type: target.class.to_s,
          upload_id: upload_id,
        )
      end
    end

    # Reconciles the fragments stored for +target+ with +upload_ids+.
    #
    # With a blank +upload_ids+, every fragment and upload reference of the
    # target is destroyed. Otherwise fragments whose upload is not in the list
    # are destroyed and the listed uploads are (re)linked through
    # +link_target_and_uploads+.
    #
    # Does nothing when +target+ is blank or the +ai_embeddings_enabled+ site
    # setting is off.
    #
    # @param target [ActiveRecord::Base] record the uploads belong to
    # @param upload_ids [Array<Integer>, nil] ids of the uploads to keep
    def update_target_uploads(target, upload_ids)
      return if target.blank?
      return if !SiteSetting.ai_embeddings_enabled?

      if upload_ids.blank?
        RagDocumentFragment.where(target: target).destroy_all
        UploadReference.where(target: target).destroy_all
      else
        RagDocumentFragment.where(target: target).where.not(upload_id: upload_ids).destroy_all
        link_target_and_uploads(target, upload_ids)
      end
    end

    # Reports, per upload, the fragment counts computed by the query below for
    # the given target record.
    #
    # @param persona [ActiveRecord::Base] record whose +id+ and class name are
    #   matched against the fragments' +target_id+ / +target_type+
    # @param uploads [Enumerable<#id>] uploads to report on
    # @return [Hash{Integer => Hash}] upload id => +{ total:, indexed:, left: }+;
    #   an empty hash when +uploads+ is blank
    def indexing_status(persona, uploads)
      # Nothing to report on; this also avoids sending an empty `IN (...)`
      # list to the database and resolving the vector representation for
      # no reason.
      return {} if uploads.blank?

      truncation = DiscourseAi::Embeddings::Strategies::Truncation.new
      vector_rep =
        DiscourseAi::Embeddings::VectorRepresentations::Base.current_representation(truncation)

      # The table name comes from the vector representation, not from caller
      # input, and is interpolated because identifiers cannot be bound as
      # query parameters.
      embeddings_table = vector_rep.rag_fragments_table_name

      sql = <<~SQL
        SELECT
          uploads.id,
          SUM(CASE WHEN (rdf.upload_id IS NOT NULL) THEN 1 ELSE 0 END) AS total,
          SUM(CASE WHEN (eft.rag_document_fragment_id IS NOT NULL) THEN 1 ELSE 0 END) as indexed,
          SUM(CASE WHEN (rdf.upload_id IS NOT NULL AND eft.rag_document_fragment_id IS NULL) THEN 1 ELSE 0 END) as left
        FROM uploads
        LEFT OUTER JOIN rag_document_fragments rdf ON uploads.id = rdf.upload_id AND rdf.target_id = :target_id
          AND rdf.target_type = :target_type
        LEFT OUTER JOIN #{embeddings_table} eft ON rdf.id = eft.rag_document_fragment_id
        WHERE uploads.id IN (:upload_ids)
        GROUP BY uploads.id
      SQL

      rows =
        DB.query(
          sql,
          target_id: persona.id,
          target_type: persona.class.to_s,
          upload_ids: uploads.map(&:id),
        )

      # GROUP BY uploads.id yields one row per upload, so keys never collide.
      rows.to_h { |row| [row.id, { total: row.total, indexed: row.indexed, left: row.left }] }
    end

    # Publishes +status+ on the upload's RAG channel, addressed only to the
    # user referenced by +upload.user_id+.
    #
    # @param upload [#id, #user_id] upload the status refers to
    # @param status [Object] payload handed to MessageBus as-is
    def publish_status(upload, status)
      MessageBus.publish("/discourse-ai/rag/#{upload.id}", status, user_ids: [upload.user_id])
    end
  end
end

# == Schema Information
#
# Table name: rag_document_fragments
#
#  id              :bigint           not null, primary key
#  fragment        :text             not null
#  upload_id       :integer          not null
#  fragment_number :integer          not null
#  created_at      :datetime         not null
#  updated_at      :datetime         not null
#  metadata        :text
#  target_id       :bigint           not null
#  target_type     :string(800)      not null
#
# Indexes
#
#  index_rag_document_fragments_on_target_type_and_target_id  (target_type,target_id)
#