FEATURE: GPT-4 turbo vision support (#575)

The recent release of GPT-4 Turbo adds vision support; this commit adds
the pipeline for sending images to OpenAI.
Sam 2024-04-11 16:22:59 +10:00 committed by GitHub
parent a77658e2b1
commit 23d12c8927
2 changed files with 77 additions and 0 deletions
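
For context, the OpenAI vision API expects a user message whose content is an array of parts rather than a plain string: one image_url part per upload, with the image inlined as a base64 data URI, followed by a text part. A minimal sketch of the message shape this commit produces (base64 payload abbreviated):

  {
    role: "user",
    content: [
      {
        type: "image_url",
        image_url: {
          url: "data:image/jpeg;base64,/9j/4AAQ...",
        },
      },
      { type: "text", text: "hello" },
    ],
  }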


@@ -65,6 +65,7 @@ module DiscourseAi
              user_message[:name] = msg[:id]
            end
          end

          user_message[:content] = inline_images(user_message[:content], msg)
          user_message
        end
      end
@@ -106,6 +107,30 @@ module DiscourseAi
      private

      def inline_images(content, message)
        if model_name.include?("gpt-4-vision") || model_name == "gpt-4-turbo"
          content = message[:content]
          encoded_uploads = prompt.encoded_uploads(message)

          if encoded_uploads.present?
            new_content = []
            new_content.concat(
              encoded_uploads.map do |details|
                {
                  type: "image_url",
                  image_url: {
                    url: "data:#{details[:mime_type]};base64,#{details[:base64]}",
                  },
                }
              end,
            )
            new_content << { type: "text", text: content }
            content = new_content
          end
        end

        content
      end
      def per_message_overhead
        # OpenAI defines about 4 tokens of overhead per message
        4
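
To make the transformation above easier to follow in isolation, here is a minimal, self-contained sketch; to_vision_content is a hypothetical name, and the only assumption carried over from the diff is that each encoded upload is a hash with :mime_type and :base64 keys:

  # Hypothetical helper mirroring the inline_images logic above:
  # map each encoded upload to an image_url part holding a base64
  # data URI, then append the original text as the final part.
  def to_vision_content(encoded_uploads, text)
    parts =
      encoded_uploads.map do |details|
        {
          type: "image_url",
          image_url: { url: "data:#{details[:mime_type]};base64,#{details[:base64]}" },
        }
      end
    parts << { type: "text", text: text }
  end

  to_vision_content([{ mime_type: "image/jpeg", base64: "/9j/4AAQ..." }], "hello")
  # => [{ type: "image_url", image_url: { url: "data:image/jpeg;base64,/9j/4AAQ..." } },
  #     { type: "text", text: "hello" }]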


@@ -165,6 +165,58 @@ RSpec.describe DiscourseAi::Completions::Endpoints::OpenAi do
    EndpointsCompliance.new(self, endpoint, DiscourseAi::Completions::Dialects::ChatGpt, user)
  end

  let(:image100x100) { plugin_file_from_fixtures("100x100.jpg") }
  let(:upload100x100) do
    UploadCreator.new(image100x100, "image.jpg").create_for(Discourse.system_user.id)
  end

  describe "image support" do
    it "can handle images" do
      llm = DiscourseAi::Completions::Llm.proxy("open_ai:gpt-4-turbo")
      prompt =
        DiscourseAi::Completions::Prompt.new(
          "You are image bot",
          messages: [type: :user, id: "user1", content: "hello", upload_ids: [upload100x100.id]],
        )

      encoded = prompt.encoded_uploads(prompt.messages.last)

      parsed_body = nil
      stub_request(:post, "https://api.openai.com/v1/chat/completions").with(
        body:
          proc do |req_body|
            parsed_body = JSON.parse(req_body, symbolize_names: true)
            true
          end,
      ).to_return(status: 200, body: { choices: [message: { content: "nice pic" }] }.to_json)

      completion = llm.generate(prompt, user: user)
      expect(completion).to eq("nice pic")

      expected_body = {
        model: "gpt-4-turbo",
        messages: [
          { role: "system", content: "You are image bot" },
          {
            role: "user",
            content: [
              {
                type: "image_url",
                image_url: {
                  url: "data:#{encoded[0][:mime_type]};base64,#{encoded[0][:base64]}",
                },
              },
              { type: "text", text: "hello" },
            ],
            name: "user1",
          },
        ],
      }
      expect(parsed_body).to eq(expected_body)
    end
  end
  describe "#perform_completion!" do
    context "when using regular mode" do
      context "with simple prompts" do
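
For completeness, a rough sketch of exercising the new path outside the specs, mirroring the objects used in the test above; it assumes the plugin is installed with an OpenAI API key configured, and upload stands in for any existing image upload:

  llm = DiscourseAi::Completions::Llm.proxy("open_ai:gpt-4-turbo")

  prompt =
    DiscourseAi::Completions::Prompt.new(
      "You are image bot",
      messages: [{ type: :user, id: "user1", content: "hello", upload_ids: [upload.id] }],
    )

  # Returns the model's text reply, e.g. a description of the image.
  llm.generate(prompt, user: Discourse.system_user)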