FEATURE: Track if a model can do vision in the llm_models table (#725)
* FEATURE: Track if a model can do vision in the llm_models table
* Data migration
parent 06e239321b
commit 5c196bca89

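In rough terms: llm_models gains a vision_enabled flag, the admin UI and serializer expose it, a data migration backfills it for known vision models, and every dialect now checks it before inlining image uploads. A minimal sketch of the resulting behaviour, assuming an LlmModel record already exists (the model name here is illustrative):

    # Mark a model as vision-capable; the column is added by the migration below.
    model = LlmModel.find_by(name: "claude-3-opus")
    model&.update!(vision_enabled: true)

    # Dialects guard image handling on the flag (see the dialect hunks below), roughly:
    #   content = inline_images(content, msg) if vision_support?
    # where vision_support? simply delegates to llm_model&.vision_enabled?
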
@@ -106,6 +106,7 @@ module DiscourseAi
           :max_prompt_tokens,
           :api_key,
           :enabled_chat_bot,
+          :vision_enabled,
         )

       provider = updating ? updating.provider : permitted[:provider]

@@ -124,4 +124,6 @@ end
 #  api_key :string
 #  user_id :integer
 #  enabled_chat_bot :boolean default(FALSE), not null
+#  provider_params :jsonb
+#  vision_enabled :boolean default(FALSE), not null
 #

@@ -13,7 +13,8 @@ class LlmModelSerializer < ApplicationSerializer
     :url,
     :enabled_chat_bot,
     :shadowed_by_srv,
-    :provider_params
+    :provider_params,
+    :vision_enabled

   has_one :user, serializer: BasicUserSerializer, embed: :object

@@ -13,7 +13,8 @@ export default class AiLlm extends RestModel {
     "url",
     "api_key",
     "enabled_chat_bot",
-    "provider_params"
+    "provider_params",
+    "vision_enabled"
   );
 }

@@ -267,6 +267,14 @@ export default class AiLlmEditorForm extends Component {
       @content={{I18n.t "discourse_ai.llms.hints.max_prompt_tokens"}}
     />
   </div>
+  <div class="control-group ai-llm-editor__vision-enabled">
+    <Input @type="checkbox" @checked={{@model.vision_enabled}} />
+    <label>{{I18n.t "discourse_ai.llms.vision_enabled"}}</label>
+    <DTooltip
+      @icon="question-circle"
+      @content={{I18n.t "discourse_ai.llms.hints.vision_enabled"}}
+    />
+  </div>
   <div class="control-group">
     <DToggleSwitch
       class="ai-llm-editor__enabled-chat-bot"

@@ -41,4 +41,9 @@
     display: flex;
     align-items: center;
   }
+
+  &__vision-enabled {
+    display: flex;
+    align-items: flex-start;
+  }
 }

@@ -228,6 +228,7 @@ en:
       url: "URL of the service hosting the model"
       api_key: "API Key of the service hosting the model"
       enabled_chat_bot: "Allow AI Bot"
+      vision_enabled: "Vision enabled"
       ai_bot_user: "AI Bot User"
       save: "Save"
       edit: "Edit"

@@ -252,6 +253,7 @@ en:
     hints:
       max_prompt_tokens: "Max numbers of tokens for the prompt. As a rule of thumb, this should be 50% of the model's context window."
       name: "We include this in the API call to specify which model we'll use."
+      vision_enabled: "If enabled, the AI will attempt to understand images. It depends on the model being used supporting vision. Supported by latest models from Anthropic, Google, and OpenAI."

     providers:
       aws_bedrock: "AWS Bedrock"

@@ -189,10 +189,13 @@ discourse_ai:
   ai_vllm_api_key: ""
   ai_llava_endpoint:
     default: ""
+    hidden: true
   ai_llava_endpoint_srv:
     default: ""
     hidden: true
-  ai_llava_api_key: ""
+  ai_llava_api_key:
+    default: ""
+    hidden: true
   ai_strict_token_counting:
     default: false
     hidden: true

@@ -254,7 +257,7 @@ discourse_ai:
       - "context_menu"
       - "image_caption"
   ai_helper_image_caption_model:
-    default: "llava"
+    default: ""
     type: enum
     enum: "DiscourseAi::Configuration::LlmVisionEnumerator"
   ai_auto_image_caption_allowed_groups:

@@ -0,0 +1,6 @@
+# frozen_string_literal: true
+class LlmModelVisionEnabled < ActiveRecord::Migration[7.1]
+  def change
+    add_column :llm_models, :vision_enabled, :boolean, default: false, null: false
+  end
+end

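With the column in place, vision capability is just a boolean on the record; a small usage sketch (the model name is illustrative):

    # Flag a model and list everything that can currently handle images.
    LlmModel.find_by(name: "gpt-4o")&.update!(vision_enabled: true)
    LlmModel.where(vision_enabled: true).pluck(:display_name)
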
@@ -0,0 +1,44 @@
+# frozen_string_literal: true
+class MigrateVisionLlms < ActiveRecord::Migration[7.1]
+  def up
+    vision_models = %w[
+      claude-3-sonnet
+      claude-3-opus
+      claude-3-haiku
+      gpt-4-vision-preview
+      gpt-4-turbo
+      gpt-4o
+      gemini-1.5-pro
+      gemini-1.5-flash
+    ]
+
+    DB.exec(<<~SQL, names: vision_models)
+      UPDATE llm_models
+      SET vision_enabled = true
+      WHERE name IN (:names)
+    SQL
+
+    current_value =
+      DB.query_single(
+        "SELECT value FROM site_settings WHERE name = :setting_name",
+        setting_name: "ai_helper_image_caption_model",
+      ).first
+
+    if current_value && current_value != "llava"
+      llm_model =
+        DB.query_single("SELECT id FROM llm_models WHERE name = :model", model: current_value).first
+
+      if llm_model
+        DB.exec(<<~SQL, new: "custom:#{llm_model}") if llm_model
+          UPDATE site_settings
+          SET value = :new
+          WHERE name = 'ai_helper_image_caption_model'
+        SQL
+      end
+    end
+  end
+
+  def down
+    raise ActiveRecord::IrreversibleMigration
+  end
+end

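The second half of the migration rewrites ai_helper_image_caption_model from a bare model name to the custom:<id> form that the completions proxy resolves back to an llm_models row. A rough sketch of that convention (the id is illustrative):

    # "custom:<llm_model_id>" points a site setting at a row in llm_models.
    model_id = DB.query_single("SELECT id FROM llm_models WHERE name = :n", n: "gpt-4o").first
    SiteSetting.ai_helper_image_caption_model = "custom:#{model_id}" if model_id
    llm = DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_helper_image_caption_model)
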
@@ -143,47 +143,26 @@ module DiscourseAi
     end

     def generate_image_caption(upload, user)
-      if SiteSetting.ai_helper_image_caption_model == "llava"
-        image_base64 =
-          DiscourseAi::Completions::UploadEncoder.encode(
-            upload_ids: [upload.id],
-            max_pixels: 1_048_576,
-          ).first[
-            :base64
-          ]
-        parameters = {
-          input: {
-            image: "data:image/#{upload.extension};base64, #{image_base64}",
-            top_p: 1,
-            max_tokens: 1024,
-            temperature: 0.2,
-            prompt: "Please describe this image in a single sentence",
-          },
-        }
-
-        ::DiscourseAi::Inference::Llava.perform!(parameters).dig(:output).join
-      else
-        prompt =
-          DiscourseAi::Completions::Prompt.new(
-            "You are a bot specializing in image captioning.",
-            messages: [
-              {
-                type: :user,
-                content:
-                  "Describe this image in a single sentence#{custom_locale_instructions(user)}",
-                upload_ids: [upload.id],
-              },
-            ],
-            skip_validations: true,
-          )
-
-        DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_helper_image_caption_model).generate(
-          prompt,
-          user: user,
-          max_tokens: 1024,
-          feature_name: "image_caption",
+      prompt =
+        DiscourseAi::Completions::Prompt.new(
+          "You are a bot specializing in image captioning.",
+          messages: [
+            {
+              type: :user,
+              content:
+                "Describe this image in a single sentence#{custom_locale_instructions(user)}",
+              upload_ids: [upload.id],
+            },
+          ],
+          skip_validations: true,
         )
-      end
+
+      DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_helper_image_caption_model).generate(
+        prompt,
+        user: user,
+        max_tokens: 1024,
+        feature_name: "image_caption",
+      )
     end

     private

@@ -78,33 +78,25 @@ module DiscourseAi
           end
         end

-        user_message[:content] = inline_images(user_message[:content], msg)
+        user_message[:content] = inline_images(user_message[:content], msg) if vision_support?
         user_message
       end

       def inline_images(content, message)
-        if model_name.include?("gpt-4-vision") || model_name == "gpt-4-turbo" ||
-             model_name == "gpt-4o"
-          content = message[:content]
-          encoded_uploads = prompt.encoded_uploads(message)
-          if encoded_uploads.present?
-            new_content = []
-            new_content.concat(
-              encoded_uploads.map do |details|
-                {
-                  type: "image_url",
-                  image_url: {
-                    url: "data:#{details[:mime_type]};base64,#{details[:base64]}",
-                  },
-                }
-              end,
-            )
-            new_content << { type: "text", text: content }
-            content = new_content
-          end
-        end
+        encoded_uploads = prompt.encoded_uploads(message)
+        return content if encoded_uploads.blank?

-        content
+        content_w_imgs =
+          encoded_uploads.reduce([]) do |memo, details|
+            memo << {
+              type: "image_url",
+              image_url: {
+                url: "data:#{details[:mime_type]};base64,#{details[:base64]}",
+              },
+            }
+          end
+
+        content_w_imgs << { type: "text", text: message[:content] }
       end

       def per_message_overhead

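For a vision-enabled model, the rewritten inline_images produces OpenAI-style multipart content: the encoded uploads come first and the original text is appended as the last part. Roughly what a translated user message ends up looking like (the values are illustrative):

    {
      role: "user",
      content: [
        { type: "image_url", image_url: { url: "data:image/jpeg;base64,<encoded image>" } },
        { type: "text", text: "Describe this image in a single sentence" },
      ],
    }
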
@@ -109,34 +109,28 @@ module DiscourseAi
         content = +""
         content << "#{msg[:id]}: " if msg[:id]
         content << msg[:content]
-        content = inline_images(content, msg)
+        content = inline_images(content, msg) if vision_support?

         { role: "user", content: content }
       end

       def inline_images(content, message)
-        if model_name.include?("claude-3")
-          encoded_uploads = prompt.encoded_uploads(message)
-          if encoded_uploads.present?
-            new_content = []
-            new_content.concat(
-              encoded_uploads.map do |details|
-                {
-                  source: {
-                    type: "base64",
-                    data: details[:base64],
-                    media_type: details[:mime_type],
-                  },
-                  type: "image",
-                }
-              end,
-            )
-            new_content << { type: "text", text: content }
-            content = new_content
-          end
-        end
+        encoded_uploads = prompt.encoded_uploads(message)
+        return content if encoded_uploads.blank?

-        content
+        content_w_imgs =
+          encoded_uploads.reduce([]) do |memo, details|
+            memo << {
+              source: {
+                type: "base64",
+                data: details[:base64],
+                media_type: details[:mime_type],
+              },
+              type: "image",
+            }
+          end
+
+        content_w_imgs << { type: "text", text: content }
       end
     end
   end

@@ -56,6 +56,10 @@ module DiscourseAi
         false
       end

+      def vision_support?
+        llm_model&.vision_enabled?
+      end
+
       def tools
         @tools ||= tools_dialect.translated_tools
       end

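Because vision_support? lives in the dialect base class and just reads llm_model&.vision_enabled?, each dialect can use the same one-line guard instead of hard-coding model names. A sketch of the pattern (the method name here is illustrative; the guard itself is taken from the dialect hunks in this commit):

    def user_msg(msg)
      content = +""
      content << msg[:content]
      content = inline_images(content, msg) if vision_support?
      { role: "user", content: content }
    end
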
@@ -114,6 +114,8 @@ module DiscourseAi
           if beta_api?
             # support new format with multiple parts
             result = { role: "user", parts: [{ text: msg[:content] }] }
+            return result unless vision_support?
+
             upload_parts = uploaded_parts(msg)
             result[:parts].concat(upload_parts) if upload_parts.present?
             result

@@ -47,7 +47,28 @@ module DiscourseAi
           content << "#{msg[:id]}: " if msg[:id]
           content << msg[:content]

-          { role: "user", content: content }
+          message = { role: "user", content: content }
+
+          message[:content] = inline_images(message[:content], msg) if vision_support?
+
+          message
         end
+
+        def inline_images(content, message)
+          encoded_uploads = prompt.encoded_uploads(message)
+          return content if encoded_uploads.blank?
+
+          content_w_imgs =
+            encoded_uploads.reduce([]) do |memo, details|
+              memo << {
+                type: "image_url",
+                image_url: {
+                  url: "data:#{details[:mime_type]};base64,#{details[:base64]}",
+                },
+              }
+            end
+
+          content_w_imgs << { type: "text", text: message[:content] }
+        end
       end
     end

@@ -35,6 +35,8 @@ module DiscourseAi
               "The number of completions you requested exceed the number of canned responses"
         end

+        raise response if response.is_a?(StandardError)
+
         @completions += 1
         if block_given?
           cancelled = false

@@ -89,15 +89,6 @@ module DiscourseAi
        DiscourseAi::Tokenizer::BasicTokenizer.available_llm_tokenizers.map(&:name)
      end

-     def vision_models_by_provider
-       @vision_models_by_provider ||= {
-         aws_bedrock: %w[claude-3-sonnet claude-3-opus claude-3-haiku],
-         anthropic: %w[claude-3-sonnet claude-3-opus claude-3-haiku],
-         open_ai: %w[gpt-4-vision-preview gpt-4-turbo gpt-4o],
-         google: %w[gemini-1.5-pro gemini-1.5-flash],
-       }
-     end
-
      def models_by_provider
        # ChatGPT models are listed under open_ai but they are actually available through OpenAI and Azure.
        # However, since they use the same URL/key settings, there's no reason to duplicate them.

@@ -10,24 +10,15 @@ module DiscourseAi
     end

     def self.values
-      begin
-        result =
-          DiscourseAi::Completions::Llm.vision_models_by_provider.flat_map do |provider, models|
-            endpoint = DiscourseAi::Completions::Endpoints::Base.endpoint_for(provider.to_s)
+      values = DB.query_hash(<<~SQL).map(&:symbolize_keys)
+        SELECT display_name AS name, id AS value
+        FROM llm_models
+        WHERE vision_enabled
+      SQL

-            models.map do |model_name|
-              { name: endpoint.display_name(model_name), value: "#{provider}:#{model_name}" }
-            end
-          end
+      values.each { |value_h| value_h[:value] = "custom:#{value_h[:value]}" }

-        result << { name: "Llava", value: "llava" }
-
-        result
-        # TODO add support for LlmModel as well
-        # LlmModel.all.each do |model|
-        #   llm_models << { name: model.display_name, value: "custom:#{model.id}" }
-        # end
-      end
+      values
     end
   end
 end

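The enumerator now builds the ai_helper_image_caption_model dropdown straight from vision-enabled llm_models rows, prefixing each id with custom:. Roughly what it returns once such rows exist (display names and ids are illustrative):

    DiscourseAi::Configuration::LlmVisionEnumerator.values
    # => [{ name: "Claude 3 Opus", value: "custom:1" }, { name: "GPT-4o", value: "custom:2" }]
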
@@ -1,31 +0,0 @@
-# frozen_string_literal: true
-
-module ::DiscourseAi
-  module Inference
-    class Llava
-      def self.perform!(content)
-        headers = { "Referer" => Discourse.base_url, "Content-Type" => "application/json" }
-        body = content.to_json
-
-        if SiteSetting.ai_llava_endpoint_srv.present?
-          service = DiscourseAi::Utils::DnsSrv.lookup(SiteSetting.ai_llava_endpoint_srv)
-          api_endpoint = "https://#{service.target}:#{service.port}"
-        else
-          api_endpoint = SiteSetting.ai_llava_endpoint
-        end
-
-        headers["X-API-KEY"] = SiteSetting.ai_llava_api_key if SiteSetting.ai_llava_api_key.present?
-
-        response = Faraday.post("#{api_endpoint}/predictions", body, headers)
-
-        raise Net::HTTPBadResponse if ![200].include?(response.status)
-
-        JSON.parse(response.body, symbolize_names: true)
-      end
-
-      def self.configured?
-        SiteSetting.ai_llava_endpoint.present? || SiteSetting.ai_llava_endpoint_srv.present?
-      end
-    end
-  end
-end

@@ -2,7 +2,18 @@
 require_relative "endpoint_compliance"

 RSpec.describe DiscourseAi::Completions::Endpoints::Anthropic do
-  let(:llm) { DiscourseAi::Completions::Llm.proxy("anthropic:claude-3-opus") }
+  let(:url) { "https://api.anthropic.com/v1/messages" }
+  fab!(:model) do
+    Fabricate(
+      :llm_model,
+      url: "https://api.anthropic.com/v1/messages",
+      name: "claude-3-opus",
+      provider: "anthropic",
+      api_key: "123",
+      vision_enabled: true,
+    )
+  end
+  let(:llm) { DiscourseAi::Completions::Llm.proxy("custom:#{model.id}") }
   let(:image100x100) { plugin_file_from_fixtures("100x100.jpg") }
   let(:upload100x100) do
     UploadCreator.new(image100x100, "image.jpg").create_for(Discourse.system_user.id)

@@ -45,8 +56,6 @@ RSpec.describe DiscourseAi::Completions::Endpoints::Anthropic do
     prompt_with_tools
   end

-  before { SiteSetting.ai_anthropic_api_key = "123" }
-
   it "does not eat spaces with tool calls" do
     SiteSetting.ai_anthropic_native_tool_call_models = "claude-3-opus"
     body = <<~STRING

@@ -108,10 +117,7 @@
     result = +""
     body = body.scan(/.*\n/)
     EndpointMock.with_chunk_array_support do
-      stub_request(:post, "https://api.anthropic.com/v1/messages").to_return(
-        status: 200,
-        body: body,
-      )
+      stub_request(:post, url).to_return(status: 200, body: body)

       llm.generate(prompt_with_google_tool, user: Discourse.system_user) do |partial|
         result << partial

@@ -161,7 +167,7 @@

     parsed_body = nil

-    stub_request(:post, "https://api.anthropic.com/v1/messages").with(
+    stub_request(:post, url).with(
       body:
         proc do |req_body|
           parsed_body = JSON.parse(req_body, symbolize_names: true)

@@ -244,7 +250,7 @@
       },
     }.to_json

-    stub_request(:post, "https://api.anthropic.com/v1/messages").to_return(body: body)
+    stub_request(:post, url).to_return(body: body)

     result = proxy.generate(prompt, user: Discourse.system_user)

@@ -314,7 +320,7 @@
     STRING

     requested_body = nil
-    stub_request(:post, "https://api.anthropic.com/v1/messages").with(
+    stub_request(:post, url).with(
       body:
         proc do |req_body|
           requested_body = JSON.parse(req_body, symbolize_names: true)

@@ -351,7 +357,7 @@
     STRING

     parsed_body = nil
-    stub_request(:post, "https://api.anthropic.com/v1/messages").with(
+    stub_request(:post, url).with(
       body:
         proc do |req_body|
           parsed_body = JSON.parse(req_body, symbolize_names: true)

@@ -130,6 +130,17 @@ end
 RSpec.describe DiscourseAi::Completions::Endpoints::Gemini do
   subject(:endpoint) { described_class.new("gemini-pro", DiscourseAi::Tokenizer::OpenAiTokenizer) }

+  fab!(:model) do
+    Fabricate(
+      :llm_model,
+      url: "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest",
+      name: "gemini-1.5-pro",
+      provider: "google",
+      api_key: "ABC",
+      vision_enabled: true,
+    )
+  end
+
   fab!(:user)

   let(:image100x100) { plugin_file_from_fixtures("100x100.jpg") }

@@ -144,8 +155,6 @@
   end

   it "Supports Vision API" do
-    SiteSetting.ai_gemini_api_key = "ABC"
-
     prompt =
       DiscourseAi::Completions::Prompt.new(
         "You are image bot",

@@ -158,9 +167,8 @@

     req_body = nil

-    llm = DiscourseAi::Completions::Llm.proxy("google:gemini-1.5-pro")
-    url =
-      "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?key=ABC"
+    llm = DiscourseAi::Completions::Llm.proxy("custom:#{model.id}")
+    url = "#{model.url}:generateContent?key=ABC"

     stub_request(:post, url).with(
       body:

@@ -202,8 +210,6 @@
   end

   it "Can correctly handle streamed responses even if they are chunked badly" do
-    SiteSetting.ai_gemini_api_key = "ABC"
-
     data = +""
     data << "da|ta: |"
     data << gemini_mock.response("Hello").to_json

@@ -214,9 +220,8 @@

     split = data.split("|")

-    llm = DiscourseAi::Completions::Llm.proxy("google:gemini-1.5-flash")
-    url =
-      "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest:streamGenerateContent?alt=sse&key=ABC"
+    llm = DiscourseAi::Completions::Llm.proxy("custom:#{model.id}")
+    url = "#{model.url}:streamGenerateContent?alt=sse&key=ABC"

     output = +""
     gemini_mock.with_chunk_array_support do

@@ -258,7 +258,8 @@

   describe "image support" do
     it "can handle images" do
-      llm = DiscourseAi::Completions::Llm.proxy("open_ai:gpt-4-turbo")
+      model = Fabricate(:llm_model, provider: "open_ai", vision_enabled: true)
+      llm = DiscourseAi::Completions::Llm.proxy("custom:#{model.id}")
       prompt =
         DiscourseAi::Completions::Prompt.new(
           "You are image bot",

@@ -112,43 +112,40 @@ RSpec.describe DiscourseAi::AiHelper::AssistantController do
       "A picture of a cat sitting on a table (#{I18n.t("discourse_ai.ai_helper.image_caption.attribution")})"
   end

+  before { assign_fake_provider_to(:ai_helper_image_caption_model) }
+
+  def request_caption(params)
+    DiscourseAi::Completions::Llm.with_prepared_responses([caption]) do
+      post "/discourse-ai/ai-helper/caption_image", params: params
+
+      yield(response)
+    end
+  end
+
   context "when logged in as an allowed user" do
     fab!(:user) { Fabricate(:user, refresh_auto_groups: true) }

     before do
       sign_in(user)
-      SiteSetting.ai_helper_allowed_groups = Group::AUTO_GROUPS[:trust_level_1]
-      SiteSetting.ai_llava_endpoint = "https://example.com"
-
-      stub_request(:post, "https://example.com/predictions").to_return(
-        status: 200,
-        body: { output: caption.gsub(" ", " |").split("|") }.to_json,
-      )
+      SiteSetting.ai_helper_allowed_groups = Group::AUTO_GROUPS[:trust_level_1]
     end

     it "returns the suggested caption for the image" do
-      post "/discourse-ai/ai-helper/caption_image",
-           params: {
-             image_url: image_url,
-             image_url_type: "long_url",
-           }
-
-      expect(response.status).to eq(200)
-      expect(response.parsed_body["caption"]).to eq(caption_with_attrs)
+      request_caption({ image_url: image_url, image_url_type: "long_url" }) do |r|
+        expect(r.status).to eq(200)
+        expect(r.parsed_body["caption"]).to eq(caption_with_attrs)
+      end
     end

     context "when the image_url is a short_url" do
       let(:image_url) { upload.short_url }

       it "returns the suggested caption for the image" do
-        post "/discourse-ai/ai-helper/caption_image",
-             params: {
-               image_url: image_url,
-               image_url_type: "short_url",
-             }
-
-        expect(response.status).to eq(200)
-        expect(response.parsed_body["caption"]).to eq(caption_with_attrs)
+        request_caption({ image_url: image_url, image_url_type: "short_url" }) do |r|
+          expect(r.status).to eq(200)
+          expect(r.parsed_body["caption"]).to eq(caption_with_attrs)
+        end
       end
     end

@@ -156,27 +153,25 @@
       let(:image_url) { "#{Discourse.base_url}#{upload.short_path}" }

       it "returns the suggested caption for the image" do
-        post "/discourse-ai/ai-helper/caption_image",
-             params: {
-               image_url: image_url,
-               image_url_type: "short_path",
-             }
-
-        expect(response.status).to eq(200)
-        expect(response.parsed_body["caption"]).to eq(caption_with_attrs)
+        request_caption({ image_url: image_url, image_url_type: "short_path" }) do |r|
+          expect(r.status).to eq(200)
+          expect(r.parsed_body["caption"]).to eq(caption_with_attrs)
+        end
       end
     end

     it "returns a 502 error when the completion call fails" do
-      stub_request(:post, "https://example.com/predictions").to_return(status: 502)
+      DiscourseAi::Completions::Llm.with_prepared_responses(
+        [DiscourseAi::Completions::Endpoints::Base::CompletionFailed.new],
+      ) do
+        post "/discourse-ai/ai-helper/caption_image",
+             params: {
+               image_url: image_url,
+               image_url_type: "long_url",
+             }

-      post "/discourse-ai/ai-helper/caption_image",
-           params: {
-             image_url: image_url,
-             image_url_type: "long_url",
-           }
-
-      expect(response.status).to eq(502)
+        expect(response.status).to eq(502)
+      end
     end

     it "returns a 400 error when the image_url is blank" do

@@ -211,9 +206,10 @@
         SiteSetting.provider = SiteSettings::DbProvider.new(SiteSetting)
         setup_s3
         stub_s3_store
+        assign_fake_provider_to(:ai_helper_image_caption_model)
         SiteSetting.secure_uploads = true
         SiteSetting.ai_helper_allowed_groups = Group::AUTO_GROUPS[:trust_level_1]
-        SiteSetting.ai_llava_endpoint = "https://example.com"

         Group.find(SiteSetting.ai_helper_allowed_groups_map.first).add(user)
         user.reload

@@ -242,14 +238,11 @@

       it "returns a 200 message and caption if user can access the secure upload" do
         group.add(user)
-        post "/discourse-ai/ai-helper/caption_image",
-             params: {
-               image_url: image_url,
-               image_url_type: "long_url",
-             }
-
-        expect(response.status).to eq(200)
-        expect(response.parsed_body["caption"]).to eq(caption_with_attrs)
+        request_caption({ image_url: image_url, image_url_type: "long_url" }) do |r|
+          expect(r.status).to eq(200)
+          expect(r.parsed_body["caption"]).to eq(caption_with_attrs)
+        end
       end

       context "if the input URL is for a secure upload but not on the secure-uploads path" do

@@ -257,13 +250,11 @@

         it "creates a signed URL properly and makes the caption" do
           group.add(user)
-          post "/discourse-ai/ai-helper/caption_image",
-               params: {
-                 image_url: image_url,
-                 image_url_type: "long_url",
-               }
-          expect(response.status).to eq(200)
-          expect(response.parsed_body["caption"]).to eq(caption_with_attrs)
+
+          request_caption({ image_url: image_url, image_url_type: "long_url" }) do |r|
+            expect(r.status).to eq(200)
+            expect(r.parsed_body["caption"]).to eq(caption_with_attrs)
+          end
         end
       end
     end

@@ -21,14 +21,9 @@ RSpec.describe "AI image caption", type: :system, js: true do
   before do
     Group.find_by(id: Group::AUTO_GROUPS[:admins]).add(user)
     assign_fake_provider_to(:ai_helper_model)
-    SiteSetting.ai_llava_endpoint = "https://example.com"
+    assign_fake_provider_to(:ai_helper_image_caption_model)
    SiteSetting.ai_helper_enabled_features = "image_caption"
    sign_in(user)
-
-    stub_request(:post, "https://example.com/predictions").to_return(
-      status: 200,
-      body: { output: caption.gsub(" ", " |").split("|") }.to_json,
-    )
   end

   shared_examples "shows no image caption button" do

@@ -53,35 +48,41 @@

   context "when triggering caption with AI on desktop" do
     it "should show an image caption in an input field" do
-      visit("/latest")
-      page.find("#create-topic").click
-      attach_file([file_path]) { composer.click_toolbar_button("upload") }
-      popup.click_generate_caption
-      expect(popup.has_caption_popup_value?(caption_with_attrs)).to eq(true)
-      popup.save_caption
-      wait_for { page.find(".image-wrapper img")["alt"] == caption_with_attrs }
-      expect(page.find(".image-wrapper img")["alt"]).to eq(caption_with_attrs)
+      DiscourseAi::Completions::Llm.with_prepared_responses([caption]) do
+        visit("/latest")
+        page.find("#create-topic").click
+        attach_file([file_path]) { composer.click_toolbar_button("upload") }
+        popup.click_generate_caption
+        expect(popup.has_caption_popup_value?(caption_with_attrs)).to eq(true)
+        popup.save_caption
+        wait_for { page.find(".image-wrapper img")["alt"] == caption_with_attrs }
+        expect(page.find(".image-wrapper img")["alt"]).to eq(caption_with_attrs)
+      end
     end

     it "should allow you to cancel a caption request" do
-      visit("/latest")
-      page.find("#create-topic").click
-      attach_file([file_path]) { composer.click_toolbar_button("upload") }
-      popup.click_generate_caption
-      popup.cancel_caption
-      expect(popup).to have_no_disabled_generate_button
+      DiscourseAi::Completions::Llm.with_prepared_responses([caption]) do
+        visit("/latest")
+        page.find("#create-topic").click
+        attach_file([file_path]) { composer.click_toolbar_button("upload") }
+        popup.click_generate_caption
+        popup.cancel_caption
+        expect(popup).to have_no_disabled_generate_button
+      end
     end
   end

   context "when triggering caption with AI on mobile", mobile: true do
     it "should show update the image alt text with the caption" do
-      visit("/latest")
-      page.find("#create-topic").click
-      attach_file([file_path]) { page.find(".mobile-file-upload").click }
-      page.find(".mobile-preview").click
-      popup.click_generate_caption
-      wait_for { page.find(".image-wrapper img")["alt"] == caption_with_attrs }
-      expect(page.find(".image-wrapper img")["alt"]).to eq(caption_with_attrs)
+      DiscourseAi::Completions::Llm.with_prepared_responses([caption]) do
+        visit("/latest")
+        page.find("#create-topic").click
+        attach_file([file_path]) { page.find(".mobile-file-upload").click }
+        page.find(".mobile-preview").click
+        popup.click_generate_caption
+        wait_for { page.find(".image-wrapper img")["alt"] == caption_with_attrs }
+        expect(page.find(".image-wrapper img")["alt"]).to eq(caption_with_attrs)
+      end
     end
   end

@@ -125,15 +126,17 @@
     end

     it "should auto caption the existing images and update the preference when dialog is accepted" do
-      visit("/latest")
-      page.find("#create-topic").click
-      attach_file([file_path]) { composer.click_toolbar_button("upload") }
-      wait_for { composer.has_no_in_progress_uploads? }
-      composer.fill_title("I love using Discourse! It is my favorite forum software")
-      composer.create
-      dialog.click_yes
-      wait_for(timeout: 100) { page.find("#post_1 .cooked img")["alt"] == caption_with_attrs }
-      expect(page.find("#post_1 .cooked img")["alt"]).to eq(caption_with_attrs)
+      DiscourseAi::Completions::Llm.with_prepared_responses([caption]) do
+        visit("/latest")
+        page.find("#create-topic").click
+        attach_file([file_path]) { composer.click_toolbar_button("upload") }
+        wait_for { composer.has_no_in_progress_uploads? }
+        composer.fill_title("I love using Discourse! It is my favorite forum software")
+        composer.create
+        dialog.click_yes
+        wait_for(timeout: 100) { page.find("#post_1 .cooked img")["alt"] == caption_with_attrs }
+        expect(page.find("#post_1 .cooked img")["alt"]).to eq(caption_with_attrs)
+      end
     end
   end

@@ -142,14 +145,16 @@

   skip "TODO: Fix auto_image_caption user option not present in testing environment?" do
     it "should auto caption the image after uploading" do
-      visit("/latest")
-      page.find("#create-topic").click
-      attach_file([Rails.root.join("spec/fixtures/images/logo.jpg")]) do
-        composer.click_toolbar_button("upload")
+      DiscourseAi::Completions::Llm.with_prepared_responses([caption]) do
+        visit("/latest")
+        page.find("#create-topic").click
+        attach_file([Rails.root.join("spec/fixtures/images/logo.jpg")]) do
+          composer.click_toolbar_button("upload")
+        end
+        wait_for { composer.has_no_in_progress_uploads? }
+        wait_for { page.find(".image-wrapper img")["alt"] == caption_with_attrs }
+        expect(page.find(".image-wrapper img")["alt"]).to eq(caption_with_attrs)
       end
-      wait_for { composer.has_no_in_progress_uploads? }
-      wait_for { page.find(".image-wrapper img")["alt"] == caption_with_attrs }
-      expect(page.find(".image-wrapper img")["alt"]).to eq(caption_with_attrs)
     end
   end
 end