From 23d12c8927a32af8a860f4f665c089483f15ed41 Mon Sep 17 00:00:00 2001
From: Sam <sam.saffron@gmail.com>
Date: Thu, 11 Apr 2024 16:22:59 +1000
Subject: [PATCH] FEATURE: GPT-4 turbo vision support (#575)

The recent release of GPT-4 Turbo adds vision support; this change adds
the pipeline for sending images to OpenAI.
---
 lib/completions/dialects/chat_gpt.rb          | 25 +++++++++
 .../lib/completions/endpoints/open_ai_spec.rb | 52 +++++++++++++++++++
 2 files changed, 77 insertions(+)

diff --git a/lib/completions/dialects/chat_gpt.rb b/lib/completions/dialects/chat_gpt.rb
index 915535f3..7368deff 100644
--- a/lib/completions/dialects/chat_gpt.rb
+++ b/lib/completions/dialects/chat_gpt.rb
@@ -65,6 +65,7 @@ module DiscourseAi
                   user_message[:name] = msg[:id]
                 end
               end
+              user_message[:content] = inline_images(user_message[:content], msg)
               user_message
             end
           end
@@ -106,6 +107,30 @@ module DiscourseAi
 
         private
 
+        def inline_images(content, message)
+          if model_name.include?("gpt-4-vision") || model_name == "gpt-4-turbo"
+            content = message[:content]
+            encoded_uploads = prompt.encoded_uploads(message)
+            if encoded_uploads.present?
+              new_content = []
+              new_content.concat(
+                encoded_uploads.map do |details|
+                  {
+                    type: "image_url",
+                    image_url: {
+                      url: "data:#{details[:mime_type]};base64,#{details[:base64]}",
+                    },
+                  }
+                end,
+              )
+              new_content << { type: "text", text: content }
+              content = new_content
+            end
+          end
+
+          content
+        end
+
         def per_message_overhead
           # open ai defines about 4 tokens per message of overhead
           4
diff --git a/spec/lib/completions/endpoints/open_ai_spec.rb b/spec/lib/completions/endpoints/open_ai_spec.rb
index fe662ae2..f0966f78 100644
--- a/spec/lib/completions/endpoints/open_ai_spec.rb
+++ b/spec/lib/completions/endpoints/open_ai_spec.rb
@@ -165,6 +165,58 @@ RSpec.describe DiscourseAi::Completions::Endpoints::OpenAi do
     EndpointsCompliance.new(self, endpoint, DiscourseAi::Completions::Dialects::ChatGpt, user)
   end
 
+  let(:image100x100) { plugin_file_from_fixtures("100x100.jpg") } # image fixture to upload
+  let(:upload100x100) do # upload created from that fixture for the system user
+    UploadCreator.new(image100x100, "image.jpg").create_for(Discourse.system_user.id)
+  end
+
+  describe "image support" do
+    it "can handle images" do
+      llm = DiscourseAi::Completions::Llm.proxy("open_ai:gpt-4-turbo")
+      prompt =
+        DiscourseAi::Completions::Prompt.new(
+          "You are image bot",
+          messages: [type: :user, id: "user1", content: "hello", upload_ids: [upload100x100.id]],
+        )
+
+      encoded = prompt.encoded_uploads(prompt.messages.last)
+
+      parsed_body = nil
+
+      stub_request(:post, "https://api.openai.com/v1/chat/completions").with(
+        body:
+          proc do |req_body|
+            parsed_body = JSON.parse(req_body, symbolize_names: true)
+            true
+          end,
+      ).to_return(status: 200, body: { choices: [message: { content: "nice pic" }] }.to_json)
+
+      completion = llm.generate(prompt, user: user)
+
+      expect(completion).to eq("nice pic")
+      expected_body = {
+        model: "gpt-4-turbo",
+        messages: [
+          { role: "system", content: "You are image bot" },
+          {
+            role: "user",
+            content: [
+              {
+                type: "image_url",
+                image_url: {
+                  url: "data:#{encoded[0][:mime_type]};base64,#{encoded[0][:base64]}",
+                },
+              },
+              { type: "text", text: "hello" },
+            ],
+            name: "user1",
+          },
+        ],
+      }
+      expect(parsed_body).to eq(expected_body)
+    end
+  end
+
   describe "#perform_completion!" do
     context "when using regular mode" do
       context "with simple prompts" do