From 23d12c8927a32af8a860f4f665c089483f15ed41 Mon Sep 17 00:00:00 2001 From: Sam Date: Thu, 11 Apr 2024 16:22:59 +1000 Subject: [PATCH] FEATURE: GPT-4 turbo vision support (#575) The recent release of GPT-4 Turbo adds vision support; this adds the pipeline for sending images to OpenAI. --- lib/completions/dialects/chat_gpt.rb | 25 +++++++++ .../lib/completions/endpoints/open_ai_spec.rb | 52 +++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/lib/completions/dialects/chat_gpt.rb b/lib/completions/dialects/chat_gpt.rb index 915535f3..7368deff 100644 --- a/lib/completions/dialects/chat_gpt.rb +++ b/lib/completions/dialects/chat_gpt.rb @@ -65,6 +65,7 @@ module DiscourseAi user_message[:name] = msg[:id] end end + user_message[:content] = inline_images(user_message[:content], msg) user_message end end @@ -106,6 +107,30 @@ module DiscourseAi private + def inline_images(content, message) + if model_name.include?("gpt-4-vision") || model_name == "gpt-4-turbo" + content = message[:content] + encoded_uploads = prompt.encoded_uploads(message) + if encoded_uploads.present? 
+ new_content = [] + new_content.concat( + encoded_uploads.map do |details| + { + type: "image_url", + image_url: { + url: "data:#{details[:mime_type]};base64,#{details[:base64]}", + }, + } + end, + ) + new_content << { type: "text", text: content } + content = new_content + end + end + + content + end + def per_message_overhead # open ai defines about 4 tokens per message of overhead 4 diff --git a/spec/lib/completions/endpoints/open_ai_spec.rb b/spec/lib/completions/endpoints/open_ai_spec.rb index fe662ae2..f0966f78 100644 --- a/spec/lib/completions/endpoints/open_ai_spec.rb +++ b/spec/lib/completions/endpoints/open_ai_spec.rb @@ -165,6 +165,58 @@ RSpec.describe DiscourseAi::Completions::Endpoints::OpenAi do EndpointsCompliance.new(self, endpoint, DiscourseAi::Completions::Dialects::ChatGpt, user) end + let(:image100x100) { plugin_file_from_fixtures("100x100.jpg") } + let(:upload100x100) do + UploadCreator.new(image100x100, "image.jpg").create_for(Discourse.system_user.id) + end + + describe "image support" do + it "can handle images" do + llm = DiscourseAi::Completions::Llm.proxy("open_ai:gpt-4-turbo") + prompt = + DiscourseAi::Completions::Prompt.new( + "You are image bot", + messages: [type: :user, id: "user1", content: "hello", upload_ids: [upload100x100.id]], + ) + + encoded = prompt.encoded_uploads(prompt.messages.last) + + parsed_body = nil + + stub_request(:post, "https://api.openai.com/v1/chat/completions").with( + body: + proc do |req_body| + parsed_body = JSON.parse(req_body, symbolize_names: true) + true + end, + ).to_return(status: 200, body: { choices: [message: { content: "nice pic" }] }.to_json) + + completion = llm.generate(prompt, user: user) + + expect(completion).to eq("nice pic") + expected_body = { + model: "gpt-4-turbo", + messages: [ + { role: "system", content: "You are image bot" }, + { + role: "user", + content: [ + { + type: "image_url", + image_url: { + url: "data:#{encoded[0][:mime_type]};base64,#{encoded[0][:base64]}", + }, + }, + { 
type: "text", text: "hello" }, + ], + name: "user1", + }, + ], + } + expect(parsed_body).to eq(expected_body) + end + end + describe "#perform_completion!" do context "when using regular mode" do context "with simple prompts" do