discourse-ai/spec/jobs/regular/digest_rag_upload_spec.rb

# frozen_string_literal: true

# Specs for Jobs::DigestRagUpload: the job is executed against an upload with
# a persona as the target, and the resulting RagDocumentFragment rows and
# enqueued follow-up jobs are checked.
RSpec.describe Jobs::DigestRagUpload do
  subject(:job) { described_class.new }

  fab!(:persona) { Fabricate(:ai_persona) }
  fab!(:upload) { Fabricate(:upload, extension: "txt") }
  fab!(:image_upload) { Fabricate(:upload, extension: "png") }
  # In-memory IO handed back by the stubbed File.open in the "first time" context.
  let(:document_file) { StringIO.new("some text" * 200) }

  fab!(:cloudflare_embedding_def)
  # Canned vector served by the stubbed embeddings endpoint; one entry per dimension.
  let(:expected_embedding) { [0.0038493] * cloudflare_embedding_def.dimensions }

  # Fixture pair: the raw document and the expected rendering of its fragments.
  let(:document_with_metadata) { plugin_file_from_fixtures("doc_with_metadata.txt", "rag") }

  let(:parsed_document_with_metadata) do
    plugin_file_from_fixtures("parsed_doc_with_metadata.txt", "rag")
  end

  let(:upload_with_metadata) do
    UploadCreator.new(document_with_metadata, "document.txt").create_for(Discourse.system_user.id)
  end

  before do
    SiteSetting.ai_embeddings_selected_model = cloudflare_embedding_def.id
    SiteSetting.ai_embeddings_enabled = true
    SiteSetting.authorized_extensions = "txt"

    # Answer every POST to the embedding definition's URL with the canned vector.
    WebMock.stub_request(:post, cloudflare_embedding_def.url).to_return(
      status: 200,
      body: JSON.dump(expected_embedding),
    )
  end

  # Runs the job for +upload_record+ with the fabricated persona as the target.
  #
  # @param upload_record [Upload] the upload whose id is passed to the job
  # @return whatever Jobs::DigestRagUpload#execute returns
  def digest(upload_record)
    job.execute(
      upload_id: upload_record.id,
      target_id: persona.id,
      target_type: persona.class.to_s,
    )
  end

  describe "#execute" do
    context "when processing an image upload" do
      it "will reject the indexing if the site setting is not enabled" do
        SiteSetting.ai_rag_images_enabled = false

        expect { digest(image_upload) }.to raise_error(Discourse::InvalidAccess)
      end
    end

    context "when processing an upload containing metadata" do
      it "correctly splits on metadata boundary" do
        # be explicit here about chunking strategy
        persona.update!(rag_chunk_tokens: 100, rag_chunk_overlap_tokens: 10)

        digest(upload_with_metadata)

        fragments = RagDocumentFragment.where(upload: upload_with_metadata).order(:fragment_number)

        # Render each fragment as "metadata / number / body" and separate the
        # fragments with a blank line, mirroring the layout of the fixture.
        parsed =
          fragments
            .map do |fragment|
              [
                "metadata: #{fragment.metadata}",
                "number: #{fragment.fragment_number}",
                fragment.fragment,
              ].join("\n")
            end
            .join("\n\n")

        # to rebuild parsed
        #File.write("/tmp/testing", parsed)

        expect(parsed).to eq(parsed_document_with_metadata.read)
      end
    end

    context "when processing an upload for the first time" do
      # Any File.open call during the example returns the in-memory document
      # (mocha also verifies that File.open is called exactly once).
      before { File.expects(:open).returns(document_file) }

      it "splits an upload into chunks" do
        digest(upload)

        created_fragment = RagDocumentFragment.last

        expect(created_fragment).to be_present
        expect(created_fragment.fragment).to be_present
        expect(created_fragment.fragment_number).to eq(2)
      end

      it "queue jobs to generate embeddings for each fragment" do
        expect { digest(upload) }.to change(Jobs::GenerateRagEmbeddings.jobs, :size).by(1)
      end
    end

    it "doesn't generate new fragments if we already processed the upload" do
      Fabricate(:rag_document_fragment, upload: upload, target: persona)

      fragments_for_upload = RagDocumentFragment.where(upload: upload, target: persona)
      previous_count = fragments_for_upload.count

      digest(upload)
      # `count` issues a fresh COUNT query, so this reflects the post-job state.
      updated_count = fragments_for_upload.count

      expect(updated_count).to eq(previous_count)
    end
  end
end