109 lines
3.5 KiB
Ruby
109 lines
3.5 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
RSpec.describe Jobs::DigestRagUpload do
|
|
fab!(:persona) { Fabricate(:ai_persona) }
|
|
fab!(:upload)
|
|
|
|
let(:document_file) { StringIO.new("some text" * 200) }
|
|
|
|
let(:truncation) { DiscourseAi::Embeddings::Strategies::Truncation.new }
|
|
let(:vector_rep) do
|
|
DiscourseAi::Embeddings::VectorRepresentations::Base.current_representation(truncation)
|
|
end
|
|
|
|
let(:expected_embedding) { [0.0038493] * vector_rep.dimensions }
|
|
|
|
let(:document_with_metadata) { plugin_file_from_fixtures("doc_with_metadata.txt", "rag") }
|
|
|
|
let(:parsed_document_with_metadata) do
|
|
plugin_file_from_fixtures("parsed_doc_with_metadata.txt", "rag")
|
|
end
|
|
|
|
let(:upload_with_metadata) do
|
|
UploadCreator.new(document_with_metadata, "document.txt").create_for(Discourse.system_user.id)
|
|
end
|
|
|
|
before do
|
|
SiteSetting.ai_embeddings_enabled = true
|
|
SiteSetting.ai_embeddings_discourse_service_api_endpoint = "http://test.com"
|
|
SiteSetting.ai_embeddings_model = "bge-large-en"
|
|
SiteSetting.authorized_extensions = "txt"
|
|
|
|
WebMock.stub_request(
|
|
:post,
|
|
"#{SiteSetting.ai_embeddings_discourse_service_api_endpoint}/api/v1/classify",
|
|
).to_return(status: 200, body: JSON.dump(expected_embedding))
|
|
end
|
|
|
|
describe "#execute" do
|
|
context "when processing an upload containing metadata" do
|
|
it "correctly splits on metadata boundary" do
|
|
# be explicit here about chunking strategy
|
|
persona.update!(rag_chunk_tokens: 100, rag_chunk_overlap_tokens: 10)
|
|
|
|
described_class.new.execute(
|
|
upload_id: upload_with_metadata.id,
|
|
target_id: persona.id,
|
|
target_type: persona.class.to_s,
|
|
)
|
|
|
|
parsed = +""
|
|
first = true
|
|
RagDocumentFragment
|
|
.where(upload: upload_with_metadata)
|
|
.order(:fragment_number)
|
|
.each do |fragment|
|
|
parsed << "\n\n" if !first
|
|
parsed << "metadata: #{fragment.metadata}\n"
|
|
parsed << "number: #{fragment.fragment_number}\n"
|
|
parsed << fragment.fragment
|
|
first = false
|
|
end
|
|
|
|
# to rebuild parsed
|
|
#File.write("/tmp/testing", parsed)
|
|
|
|
expect(parsed).to eq(parsed_document_with_metadata.read)
|
|
end
|
|
end
|
|
context "when processing an upload for the first time" do
|
|
before { File.expects(:open).returns(document_file) }
|
|
|
|
it "splits an upload into chunks" do
|
|
subject.execute(
|
|
upload_id: upload.id,
|
|
target_id: persona.id,
|
|
target_type: persona.class.to_s,
|
|
)
|
|
|
|
created_fragment = RagDocumentFragment.last
|
|
|
|
expect(created_fragment).to be_present
|
|
expect(created_fragment.fragment).to be_present
|
|
expect(created_fragment.fragment_number).to eq(2)
|
|
end
|
|
|
|
it "queue jobs to generate embeddings for each fragment" do
|
|
expect {
|
|
subject.execute(
|
|
upload_id: upload.id,
|
|
target_id: persona.id,
|
|
target_type: persona.class.to_s,
|
|
)
|
|
}.to change(Jobs::GenerateRagEmbeddings.jobs, :size).by(1)
|
|
end
|
|
end
|
|
|
|
it "doesn't generate new fragments if we already processed the upload" do
|
|
Fabricate(:rag_document_fragment, upload: upload, target: persona)
|
|
|
|
previous_count = RagDocumentFragment.where(upload: upload, target: persona).count
|
|
|
|
subject.execute(upload_id: upload.id, target_id: persona.id, target_type: persona.class.to_s)
|
|
updated_count = RagDocumentFragment.where(upload: upload, target: persona).count
|
|
|
|
expect(updated_count).to eq(previous_count)
|
|
end
|
|
end
|
|
end
|