diff --git a/app/controllers/discourse_ai/admin/ai_llms_controller.rb b/app/controllers/discourse_ai/admin/ai_llms_controller.rb
index 9098f305..72cd7b13 100644
--- a/app/controllers/discourse_ai/admin/ai_llms_controller.rb
+++ b/app/controllers/discourse_ai/admin/ai_llms_controller.rb
@@ -106,6 +106,7 @@ module DiscourseAi
             :max_prompt_tokens,
             :api_key,
             :enabled_chat_bot,
+            :vision_enabled,
           )
 
         provider = updating ? updating.provider : permitted[:provider]
diff --git a/app/models/llm_model.rb b/app/models/llm_model.rb
index 180a7a84..73666dd5 100644
--- a/app/models/llm_model.rb
+++ b/app/models/llm_model.rb
@@ -124,4 +124,6 @@ end
 #  api_key           :string
 #  user_id           :integer
 #  enabled_chat_bot  :boolean          default(FALSE), not null
+#  provider_params   :jsonb
+#  vision_enabled    :boolean          default(FALSE), not null
 #
diff --git a/app/serializers/llm_model_serializer.rb b/app/serializers/llm_model_serializer.rb
index 268f41b2..71a1c1b9 100644
--- a/app/serializers/llm_model_serializer.rb
+++ b/app/serializers/llm_model_serializer.rb
@@ -13,7 +13,8 @@ class LlmModelSerializer < ApplicationSerializer
              :url,
              :enabled_chat_bot,
              :shadowed_by_srv,
-             :provider_params
+             :provider_params,
+             :vision_enabled
 
   has_one :user, serializer: BasicUserSerializer, embed: :object
 
diff --git a/assets/javascripts/discourse/admin/models/ai-llm.js b/assets/javascripts/discourse/admin/models/ai-llm.js
index e81d0d04..8545ee6b 100644
--- a/assets/javascripts/discourse/admin/models/ai-llm.js
+++ b/assets/javascripts/discourse/admin/models/ai-llm.js
@@ -13,7 +13,8 @@ export default class AiLlm extends RestModel {
       "url",
       "api_key",
       "enabled_chat_bot",
-      "provider_params"
+      "provider_params",
+      "vision_enabled"
     );
   }
 
diff --git a/assets/javascripts/discourse/components/ai-llm-editor-form.gjs b/assets/javascripts/discourse/components/ai-llm-editor-form.gjs
index 11a198fc..20ce95db 100644
--- a/assets/javascripts/discourse/components/ai-llm-editor-form.gjs
+++ b/assets/javascripts/discourse/components/ai-llm-editor-form.gjs
@@ -267,6 +267,14 @@ export default class AiLlmEditorForm extends Component {
           @content={{I18n.t "discourse_ai.llms.hints.max_prompt_tokens"}}
         />
       </div>
+      <div class="control-group ai-llm-editor__vision-enabled">
+        <Input @type="checkbox" @checked={{@model.vision_enabled}} />
+        <label>{{I18n.t "discourse_ai.llms.vision_enabled"}}</label>
+        <DTooltip
+          @icon="question-circle"
+          @content={{I18n.t "discourse_ai.llms.hints.vision_enabled"}}
+        />
+      </div>
       <div class="control-group">
         <DToggleSwitch
           class="ai-llm-editor__enabled-chat-bot"
diff --git a/assets/stylesheets/modules/llms/common/ai-llms-editor.scss b/assets/stylesheets/modules/llms/common/ai-llms-editor.scss
index 2b0cee70..53aeae3a 100644
--- a/assets/stylesheets/modules/llms/common/ai-llms-editor.scss
+++ b/assets/stylesheets/modules/llms/common/ai-llms-editor.scss
@@ -41,4 +41,9 @@
     display: flex;
     align-items: center;
   }
+
+  &__vision-enabled {
+    display: flex;
+    align-items: flex-start;
+  }
 }
diff --git a/config/locales/client.en.yml b/config/locales/client.en.yml
index 039ca3eb..675ace23 100644
--- a/config/locales/client.en.yml
+++ b/config/locales/client.en.yml
@@ -228,6 +228,7 @@ en:
         url: "URL of the service hosting the model"
         api_key: "API Key of the service hosting the model"
         enabled_chat_bot: "Allow AI Bot"
+        vision_enabled: "Vision enabled"
         ai_bot_user: "AI Bot User"
         save: "Save"
         edit: "Edit"
@@ -252,6 +253,7 @@ en:
         hints:
           max_prompt_tokens: "Max numbers of tokens for the prompt. As a rule of thumb, this should be 50% of the model's context window."
           name: "We include this in the API call to specify which model we'll use."
+          vision_enabled: "If enabled, the AI will attempt to understand images. This requires a model that supports vision, such as the latest models from Anthropic, Google, and OpenAI."
 
         providers:
           aws_bedrock: "AWS Bedrock"
diff --git a/config/settings.yml b/config/settings.yml
index f890ad8f..fa23c6cc 100644
--- a/config/settings.yml
+++ b/config/settings.yml
@@ -189,10 +189,13 @@ discourse_ai:
   ai_vllm_api_key: ""
   ai_llava_endpoint:
     default: ""
+    hidden: true
   ai_llava_endpoint_srv:
     default: ""
     hidden: true
-  ai_llava_api_key: ""
+  ai_llava_api_key:
+    default: ""
+    hidden: true
   ai_strict_token_counting:
     default: false
     hidden: true
@@ -254,7 +257,7 @@ discourse_ai:
       - "context_menu"
       - "image_caption"
   ai_helper_image_caption_model:
-    default: "llava"
+    default: ""
     type: enum
     enum: "DiscourseAi::Configuration::LlmVisionEnumerator"
   ai_auto_image_caption_allowed_groups:
diff --git a/db/migrate/20240719143453_llm_model_vision_enabled.rb b/db/migrate/20240719143453_llm_model_vision_enabled.rb
new file mode 100644
index 00000000..c2280818
--- /dev/null
+++ b/db/migrate/20240719143453_llm_model_vision_enabled.rb
@@ -0,0 +1,6 @@
+# frozen_string_literal: true
+class LlmModelVisionEnabled < ActiveRecord::Migration[7.1]
+  def change
+    add_column :llm_models, :vision_enabled, :boolean, default: false, null: false
+  end
+end
diff --git a/db/post_migrate/20240724174343_migrate_vision_llms.rb b/db/post_migrate/20240724174343_migrate_vision_llms.rb
new file mode 100644
index 00000000..a57788b5
--- /dev/null
+++ b/db/post_migrate/20240724174343_migrate_vision_llms.rb
@@ -0,0 +1,60 @@
+# frozen_string_literal: true
+
+# Data migration that accompanies the llm_models.vision_enabled column:
+#   1. flags the models that used to be hard-coded as vision capable, and
+#   2. rewrites ai_helper_image_caption_model from the legacy
+#      "provider:model_name" value to "custom:<llm_models.id>".
+class MigrateVisionLlms < ActiveRecord::Migration[7.1]
+  def up
+    vision_models = %w[
+      claude-3-sonnet
+      claude-3-opus
+      claude-3-haiku
+      gpt-4-vision-preview
+      gpt-4-turbo
+      gpt-4o
+      gemini-1.5-pro
+      gemini-1.5-flash
+    ]
+
+    DB.exec(<<~SQL, names: vision_models)
+      UPDATE llm_models
+      SET vision_enabled = true
+      WHERE name IN (:names)
+    SQL
+
+    current_value =
+      DB.query_single(
+        "SELECT value FROM site_settings WHERE name = :setting_name",
+        setting_name: "ai_helper_image_caption_model",
+      ).first
+
+    # Nothing to rewrite when the setting was never stored; "llava" has no
+    # llm_models counterpart, so it is left as is.
+    return if current_value.blank? || current_value == "llava"
+
+    # The previous enumerator stored values such as "open_ai:gpt-4o", while
+    # llm_models.name holds only the bare model name, so split before the lookup.
+    provider, model_name = current_value.split(":", 2)
+    return if model_name.blank?
+
+    llm_model_id = DB.query_single(<<~SQL, provider: provider, model: model_name).first
+      SELECT id
+      FROM llm_models
+      WHERE provider = :provider AND name = :model
+      ORDER BY id
+      LIMIT 1
+    SQL
+    return if llm_model_id.nil?
+
+    DB.exec(<<~SQL, new: "custom:#{llm_model_id}")
+      UPDATE site_settings
+      SET value = :new
+      WHERE name = 'ai_helper_image_caption_model'
+    SQL
+  end
+
+  def down
+    raise ActiveRecord::IrreversibleMigration
+  end
+end
diff --git a/lib/ai_helper/assistant.rb b/lib/ai_helper/assistant.rb
index aa360a7e..e95b2841 100644
--- a/lib/ai_helper/assistant.rb
+++ b/lib/ai_helper/assistant.rb
@@ -143,47 +143,26 @@ module DiscourseAi
       end
 
       def generate_image_caption(upload, user)
-        if SiteSetting.ai_helper_image_caption_model == "llava"
-          image_base64 =
-            DiscourseAi::Completions::UploadEncoder.encode(
-              upload_ids: [upload.id],
-              max_pixels: 1_048_576,
-            ).first[
-              :base64
-            ]
-          parameters = {
-            input: {
-              image: "data:image/#{upload.extension};base64, #{image_base64}",
-              top_p: 1,
-              max_tokens: 1024,
-              temperature: 0.2,
-              prompt: "Please describe this image in a single sentence",
-            },
-          }
-
-          ::DiscourseAi::Inference::Llava.perform!(parameters).dig(:output).join
-        else
-          prompt =
-            DiscourseAi::Completions::Prompt.new(
-              "You are a bot specializing in image captioning.",
-              messages: [
-                {
-                  type: :user,
-                  content:
-                    "Describe this image in a single sentence#{custom_locale_instructions(user)}",
-                  upload_ids: [upload.id],
-                },
-              ],
-              skip_validations: true,
-            )
-
-          DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_helper_image_caption_model).generate(
-            prompt,
-            user: user,
-            max_tokens: 1024,
-            feature_name: "image_caption",
+        prompt =
+          DiscourseAi::Completions::Prompt.new(
+            "You are a bot specializing in image captioning.",
+            messages: [
+              {
+                type: :user,
+                content:
+                  "Describe this image in a single sentence#{custom_locale_instructions(user)}",
+                upload_ids: [upload.id],
+              },
+            ],
+            skip_validations: true,
           )
-        end
+
+        DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_helper_image_caption_model).generate(
+          prompt,
+          user: user,
+          max_tokens: 1024,
+          feature_name: "image_caption",
+        )
       end
 
       private
diff --git a/lib/completions/dialects/chat_gpt.rb b/lib/completions/dialects/chat_gpt.rb
index c796dd75..c827c4cf 100644
--- a/lib/completions/dialects/chat_gpt.rb
+++ b/lib/completions/dialects/chat_gpt.rb
@@ -78,33 +78,25 @@ module DiscourseAi
             end
           end
 
-          user_message[:content] = inline_images(user_message[:content], msg)
+          user_message[:content] = inline_images(user_message[:content], msg) if vision_support?
           user_message
         end
 
         def inline_images(content, message)
-          if model_name.include?("gpt-4-vision") || model_name == "gpt-4-turbo" ||
-               model_name == "gpt-4o"
-            content = message[:content]
-            encoded_uploads = prompt.encoded_uploads(message)
-            if encoded_uploads.present?
-              new_content = []
-              new_content.concat(
-                encoded_uploads.map do |details|
-                  {
-                    type: "image_url",
-                    image_url: {
-                      url: "data:#{details[:mime_type]};base64,#{details[:base64]}",
-                    },
-                  }
-                end,
-              )
-              new_content << { type: "text", text: content }
-              content = new_content
-            end
-          end
+          encoded_uploads = prompt.encoded_uploads(message)
+          return content if encoded_uploads.blank?
 
-          content
+          content_w_imgs =
+            encoded_uploads.reduce([]) do |memo, details|
+              memo << {
+                type: "image_url",
+                image_url: {
+                  url: "data:#{details[:mime_type]};base64,#{details[:base64]}",
+                },
+              }
+            end
+
+          content_w_imgs << { type: "text", text: message[:content] }
         end
 
         def per_message_overhead
diff --git a/lib/completions/dialects/claude.rb b/lib/completions/dialects/claude.rb
index bf43d9b7..912a8369 100644
--- a/lib/completions/dialects/claude.rb
+++ b/lib/completions/dialects/claude.rb
@@ -109,34 +109,28 @@ module DiscourseAi
           content = +""
           content << "#{msg[:id]}: " if msg[:id]
           content << msg[:content]
-          content = inline_images(content, msg)
+          content = inline_images(content, msg) if vision_support?
 
           { role: "user", content: content }
         end
 
         def inline_images(content, message)
-          if model_name.include?("claude-3")
-            encoded_uploads = prompt.encoded_uploads(message)
-            if encoded_uploads.present?
-              new_content = []
-              new_content.concat(
-                encoded_uploads.map do |details|
-                  {
-                    source: {
-                      type: "base64",
-                      data: details[:base64],
-                      media_type: details[:mime_type],
-                    },
-                    type: "image",
-                  }
-                end,
-              )
-              new_content << { type: "text", text: content }
-              content = new_content
-            end
-          end
+          encoded_uploads = prompt.encoded_uploads(message)
+          return content if encoded_uploads.blank?
 
-          content
+          content_w_imgs =
+            encoded_uploads.reduce([]) do |memo, details|
+              memo << {
+                source: {
+                  type: "base64",
+                  data: details[:base64],
+                  media_type: details[:mime_type],
+                },
+                type: "image",
+              }
+            end
+
+          content_w_imgs << { type: "text", text: content }
         end
       end
     end
diff --git a/lib/completions/dialects/dialect.rb b/lib/completions/dialects/dialect.rb
index 7d49e396..33329794 100644
--- a/lib/completions/dialects/dialect.rb
+++ b/lib/completions/dialects/dialect.rb
@@ -56,6 +56,10 @@ module DiscourseAi
           false
         end
 
+        def vision_support?
+          llm_model&.vision_enabled?
+        end
+
         def tools
           @tools ||= tools_dialect.translated_tools
         end
diff --git a/lib/completions/dialects/gemini.rb b/lib/completions/dialects/gemini.rb
index a6e04c23..2fea546e 100644
--- a/lib/completions/dialects/gemini.rb
+++ b/lib/completions/dialects/gemini.rb
@@ -114,6 +114,8 @@ module DiscourseAi
           if beta_api?
             # support new format with multiple parts
             result = { role: "user", parts: [{ text: msg[:content] }] }
+            return result unless vision_support?
+
             upload_parts = uploaded_parts(msg)
             result[:parts].concat(upload_parts) if upload_parts.present?
             result
diff --git a/lib/completions/dialects/open_ai_compatible.rb b/lib/completions/dialects/open_ai_compatible.rb
index 5bf36bb2..ec91c49a 100644
--- a/lib/completions/dialects/open_ai_compatible.rb
+++ b/lib/completions/dialects/open_ai_compatible.rb
@@ -47,7 +47,31 @@ module DiscourseAi
           content << "#{msg[:id]}: " if msg[:id]
           content << msg[:content]
 
-          { role: "user", content: content }
+          message = { role: "user", content: content }
+
+          message[:content] = inline_images(message[:content], msg) if vision_support?
+
+          message
+        end
+
+        # Prepends the message's encoded uploads to `content` as image_url parts.
+        # Returns `content` untouched when the message has no uploads.
+        def inline_images(content, message)
+          encoded_uploads = prompt.encoded_uploads(message)
+          return content if encoded_uploads.blank?
+
+          image_parts =
+            encoded_uploads.map do |details|
+              {
+                type: "image_url",
+                image_url: {
+                  url: "data:#{details[:mime_type]};base64,#{details[:base64]}",
+                },
+              }
+            end
+
+          # Use `content` (not message[:content]) so the "<id>: " prefix is kept.
+          image_parts << { type: "text", text: content }
         end
       end
     end
diff --git a/lib/completions/endpoints/canned_response.rb b/lib/completions/endpoints/canned_response.rb
index eae930ca..c5af2803 100644
--- a/lib/completions/endpoints/canned_response.rb
+++ b/lib/completions/endpoints/canned_response.rb
@@ -35,6 +35,8 @@ module DiscourseAi
                   "The number of completions you requested exceed the number of canned responses"
           end
 
+          raise response if response.is_a?(StandardError)
+
           @completions += 1
           if block_given?
             cancelled = false
diff --git a/lib/completions/llm.rb b/lib/completions/llm.rb
index c2660090..7d6489cb 100644
--- a/lib/completions/llm.rb
+++ b/lib/completions/llm.rb
@@ -89,15 +89,6 @@ module DiscourseAi
           DiscourseAi::Tokenizer::BasicTokenizer.available_llm_tokenizers.map(&:name)
         end
 
-        def vision_models_by_provider
-          @vision_models_by_provider ||= {
-            aws_bedrock: %w[claude-3-sonnet claude-3-opus claude-3-haiku],
-            anthropic: %w[claude-3-sonnet claude-3-opus claude-3-haiku],
-            open_ai: %w[gpt-4-vision-preview gpt-4-turbo gpt-4o],
-            google: %w[gemini-1.5-pro gemini-1.5-flash],
-          }
-        end
-
         def models_by_provider
           # ChatGPT models are listed under open_ai but they are actually available through OpenAI and Azure.
           # However, since they use the same URL/key settings, there's no reason to duplicate them.
diff --git a/lib/configuration/llm_vision_enumerator.rb b/lib/configuration/llm_vision_enumerator.rb
index f57f4587..c4cf1a62 100644
--- a/lib/configuration/llm_vision_enumerator.rb
+++ b/lib/configuration/llm_vision_enumerator.rb
@@ -10,24 +10,15 @@ module DiscourseAi
       end
 
       def self.values
-        begin
-          result =
-            DiscourseAi::Completions::Llm.vision_models_by_provider.flat_map do |provider, models|
-              endpoint = DiscourseAi::Completions::Endpoints::Base.endpoint_for(provider.to_s)
+        values = DB.query_hash(<<~SQL).map(&:symbolize_keys)
+          SELECT display_name AS name, id AS value
+          FROM llm_models
+          WHERE vision_enabled
+        SQL
 
-              models.map do |model_name|
-                { name: endpoint.display_name(model_name), value: "#{provider}:#{model_name}" }
-              end
-            end
+        values.each { |value_h| value_h[:value] = "custom:#{value_h[:value]}" }
 
-          result << { name: "Llava", value: "llava" }
-
-          result
-          # TODO add support for LlmModel as well
-          # LlmModel.all.each do |model|
-          #  llm_models << { name: model.display_name, value: "custom:#{model.id}" }
-          # end
-        end
+        values
       end
     end
   end
diff --git a/lib/inference/llava.rb b/lib/inference/llava.rb
deleted file mode 100644
index 3ca8a341..00000000
--- a/lib/inference/llava.rb
+++ /dev/null
@@ -1,31 +0,0 @@
-# frozen_string_literal: true
-
-module ::DiscourseAi
-  module Inference
-    class Llava
-      def self.perform!(content)
-        headers = { "Referer" => Discourse.base_url, "Content-Type" => "application/json" }
-        body = content.to_json
-
-        if SiteSetting.ai_llava_endpoint_srv.present?
-          service = DiscourseAi::Utils::DnsSrv.lookup(SiteSetting.ai_llava_endpoint_srv)
-          api_endpoint = "https://#{service.target}:#{service.port}"
-        else
-          api_endpoint = SiteSetting.ai_llava_endpoint
-        end
-
-        headers["X-API-KEY"] = SiteSetting.ai_llava_api_key if SiteSetting.ai_llava_api_key.present?
-
-        response = Faraday.post("#{api_endpoint}/predictions", body, headers)
-
-        raise Net::HTTPBadResponse if ![200].include?(response.status)
-
-        JSON.parse(response.body, symbolize_names: true)
-      end
-
-      def self.configured?
-        SiteSetting.ai_llava_endpoint.present? || SiteSetting.ai_llava_endpoint_srv.present?
-      end
-    end
-  end
-end
diff --git a/spec/lib/completions/endpoints/anthropic_spec.rb b/spec/lib/completions/endpoints/anthropic_spec.rb
index 0c47f0e8..dfecbe1f 100644
--- a/spec/lib/completions/endpoints/anthropic_spec.rb
+++ b/spec/lib/completions/endpoints/anthropic_spec.rb
@@ -2,7 +2,18 @@
 require_relative "endpoint_compliance"
 
 RSpec.describe DiscourseAi::Completions::Endpoints::Anthropic do
-  let(:llm) { DiscourseAi::Completions::Llm.proxy("anthropic:claude-3-opus") }
+  let(:url) { "https://api.anthropic.com/v1/messages" }
+  fab!(:model) do
+    Fabricate(
+      :llm_model,
+      url: "https://api.anthropic.com/v1/messages",
+      name: "claude-3-opus",
+      provider: "anthropic",
+      api_key: "123",
+      vision_enabled: true,
+    )
+  end
+  let(:llm) { DiscourseAi::Completions::Llm.proxy("custom:#{model.id}") }
   let(:image100x100) { plugin_file_from_fixtures("100x100.jpg") }
   let(:upload100x100) do
     UploadCreator.new(image100x100, "image.jpg").create_for(Discourse.system_user.id)
@@ -45,8 +56,6 @@ RSpec.describe DiscourseAi::Completions::Endpoints::Anthropic do
     prompt_with_tools
   end
 
-  before { SiteSetting.ai_anthropic_api_key = "123" }
-
   it "does not eat spaces with tool calls" do
     SiteSetting.ai_anthropic_native_tool_call_models = "claude-3-opus"
     body = <<~STRING
@@ -108,10 +117,7 @@ RSpec.describe DiscourseAi::Completions::Endpoints::Anthropic do
     result = +""
     body = body.scan(/.*\n/)
     EndpointMock.with_chunk_array_support do
-      stub_request(:post, "https://api.anthropic.com/v1/messages").to_return(
-        status: 200,
-        body: body,
-      )
+      stub_request(:post, url).to_return(status: 200, body: body)
 
       llm.generate(prompt_with_google_tool, user: Discourse.system_user) do |partial|
         result << partial
@@ -161,7 +167,7 @@ RSpec.describe DiscourseAi::Completions::Endpoints::Anthropic do
 
     parsed_body = nil
 
-    stub_request(:post, "https://api.anthropic.com/v1/messages").with(
+    stub_request(:post, url).with(
       body:
         proc do |req_body|
           parsed_body = JSON.parse(req_body, symbolize_names: true)
@@ -244,7 +250,7 @@ RSpec.describe DiscourseAi::Completions::Endpoints::Anthropic do
       },
     }.to_json
 
-    stub_request(:post, "https://api.anthropic.com/v1/messages").to_return(body: body)
+    stub_request(:post, url).to_return(body: body)
 
     result = proxy.generate(prompt, user: Discourse.system_user)
 
@@ -314,7 +320,7 @@ RSpec.describe DiscourseAi::Completions::Endpoints::Anthropic do
     STRING
 
     requested_body = nil
-    stub_request(:post, "https://api.anthropic.com/v1/messages").with(
+    stub_request(:post, url).with(
       body:
         proc do |req_body|
           requested_body = JSON.parse(req_body, symbolize_names: true)
@@ -351,7 +357,7 @@ RSpec.describe DiscourseAi::Completions::Endpoints::Anthropic do
     STRING
 
     parsed_body = nil
-    stub_request(:post, "https://api.anthropic.com/v1/messages").with(
+    stub_request(:post, url).with(
       body:
         proc do |req_body|
           parsed_body = JSON.parse(req_body, symbolize_names: true)
diff --git a/spec/lib/completions/endpoints/gemini_spec.rb b/spec/lib/completions/endpoints/gemini_spec.rb
index dfa0d7fc..bcd733f6 100644
--- a/spec/lib/completions/endpoints/gemini_spec.rb
+++ b/spec/lib/completions/endpoints/gemini_spec.rb
@@ -130,6 +130,17 @@ end
 RSpec.describe DiscourseAi::Completions::Endpoints::Gemini do
   subject(:endpoint) { described_class.new("gemini-pro", DiscourseAi::Tokenizer::OpenAiTokenizer) }
 
+  fab!(:model) do
+    Fabricate(
+      :llm_model,
+      url: "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest",
+      name: "gemini-1.5-pro",
+      provider: "google",
+      api_key: "ABC",
+      vision_enabled: true,
+    )
+  end
+
   fab!(:user)
 
   let(:image100x100) { plugin_file_from_fixtures("100x100.jpg") }
@@ -144,8 +155,6 @@ RSpec.describe DiscourseAi::Completions::Endpoints::Gemini do
   end
 
   it "Supports Vision API" do
-    SiteSetting.ai_gemini_api_key = "ABC"
-
     prompt =
       DiscourseAi::Completions::Prompt.new(
         "You are image bot",
@@ -158,9 +167,8 @@ RSpec.describe DiscourseAi::Completions::Endpoints::Gemini do
 
     req_body = nil
 
-    llm = DiscourseAi::Completions::Llm.proxy("google:gemini-1.5-pro")
-    url =
-      "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?key=ABC"
+    llm = DiscourseAi::Completions::Llm.proxy("custom:#{model.id}")
+    url = "#{model.url}:generateContent?key=ABC"
 
     stub_request(:post, url).with(
       body:
@@ -202,8 +210,6 @@ RSpec.describe DiscourseAi::Completions::Endpoints::Gemini do
   end
 
   it "Can correctly handle streamed responses even if they are chunked badly" do
-    SiteSetting.ai_gemini_api_key = "ABC"
-
     data = +""
     data << "da|ta: |"
     data << gemini_mock.response("Hello").to_json
@@ -214,9 +220,8 @@ RSpec.describe DiscourseAi::Completions::Endpoints::Gemini do
 
     split = data.split("|")
 
-    llm = DiscourseAi::Completions::Llm.proxy("google:gemini-1.5-flash")
-    url =
-      "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest:streamGenerateContent?alt=sse&key=ABC"
+    llm = DiscourseAi::Completions::Llm.proxy("custom:#{model.id}")
+    url = "#{model.url}:streamGenerateContent?alt=sse&key=ABC"
 
     output = +""
     gemini_mock.with_chunk_array_support do
diff --git a/spec/lib/completions/endpoints/open_ai_spec.rb b/spec/lib/completions/endpoints/open_ai_spec.rb
index b4d942f2..6703d7ec 100644
--- a/spec/lib/completions/endpoints/open_ai_spec.rb
+++ b/spec/lib/completions/endpoints/open_ai_spec.rb
@@ -258,7 +258,8 @@ RSpec.describe DiscourseAi::Completions::Endpoints::OpenAi do
 
   describe "image support" do
     it "can handle images" do
-      llm = DiscourseAi::Completions::Llm.proxy("open_ai:gpt-4-turbo")
+      model = Fabricate(:llm_model, provider: "open_ai", vision_enabled: true)
+      llm = DiscourseAi::Completions::Llm.proxy("custom:#{model.id}")
       prompt =
         DiscourseAi::Completions::Prompt.new(
           "You are image bot",
diff --git a/spec/requests/ai_helper/assistant_controller_spec.rb b/spec/requests/ai_helper/assistant_controller_spec.rb
index f119f75d..d6c00bb6 100644
--- a/spec/requests/ai_helper/assistant_controller_spec.rb
+++ b/spec/requests/ai_helper/assistant_controller_spec.rb
@@ -112,43 +112,40 @@ RSpec.describe DiscourseAi::AiHelper::AssistantController do
       "A picture of a cat sitting on a table (#{I18n.t("discourse_ai.ai_helper.image_caption.attribution")})"
     end
 
+    before { assign_fake_provider_to(:ai_helper_image_caption_model) }
+
+    def request_caption(params)
+      DiscourseAi::Completions::Llm.with_prepared_responses([caption]) do
+        post "/discourse-ai/ai-helper/caption_image", params: params
+
+        yield(response)
+      end
+    end
+
     context "when logged in as an allowed user" do
       fab!(:user) { Fabricate(:user, refresh_auto_groups: true) }
 
       before do
         sign_in(user)
-        SiteSetting.ai_helper_allowed_groups = Group::AUTO_GROUPS[:trust_level_1]
-        SiteSetting.ai_llava_endpoint = "https://example.com"
 
-        stub_request(:post, "https://example.com/predictions").to_return(
-          status: 200,
-          body: { output: caption.gsub(" ", " |").split("|") }.to_json,
-        )
+        SiteSetting.ai_helper_allowed_groups = Group::AUTO_GROUPS[:trust_level_1]
       end
 
       it "returns the suggested caption for the image" do
-        post "/discourse-ai/ai-helper/caption_image",
-             params: {
-               image_url: image_url,
-               image_url_type: "long_url",
-             }
-
-        expect(response.status).to eq(200)
-        expect(response.parsed_body["caption"]).to eq(caption_with_attrs)
+        request_caption({ image_url: image_url, image_url_type: "long_url" }) do |r|
+          expect(r.status).to eq(200)
+          expect(r.parsed_body["caption"]).to eq(caption_with_attrs)
+        end
       end
 
       context "when the image_url is a short_url" do
         let(:image_url) { upload.short_url }
 
         it "returns the suggested caption for the image" do
-          post "/discourse-ai/ai-helper/caption_image",
-               params: {
-                 image_url: image_url,
-                 image_url_type: "short_url",
-               }
-
-          expect(response.status).to eq(200)
-          expect(response.parsed_body["caption"]).to eq(caption_with_attrs)
+          request_caption({ image_url: image_url, image_url_type: "short_url" }) do |r|
+            expect(r.status).to eq(200)
+            expect(r.parsed_body["caption"]).to eq(caption_with_attrs)
+          end
         end
       end
 
@@ -156,27 +153,25 @@ RSpec.describe DiscourseAi::AiHelper::AssistantController do
         let(:image_url) { "#{Discourse.base_url}#{upload.short_path}" }
 
         it "returns the suggested caption for the image" do
-          post "/discourse-ai/ai-helper/caption_image",
-               params: {
-                 image_url: image_url,
-                 image_url_type: "short_path",
-               }
-
-          expect(response.status).to eq(200)
-          expect(response.parsed_body["caption"]).to eq(caption_with_attrs)
+          request_caption({ image_url: image_url, image_url_type: "short_path" }) do |r|
+            expect(r.status).to eq(200)
+            expect(r.parsed_body["caption"]).to eq(caption_with_attrs)
+          end
         end
       end
 
       it "returns a 502 error when the completion call fails" do
-        stub_request(:post, "https://example.com/predictions").to_return(status: 502)
+        DiscourseAi::Completions::Llm.with_prepared_responses(
+          [DiscourseAi::Completions::Endpoints::Base::CompletionFailed.new],
+        ) do
+          post "/discourse-ai/ai-helper/caption_image",
+               params: {
+                 image_url: image_url,
+                 image_url_type: "long_url",
+               }
 
-        post "/discourse-ai/ai-helper/caption_image",
-             params: {
-               image_url: image_url,
-               image_url_type: "long_url",
-             }
-
-        expect(response.status).to eq(502)
+          expect(response.status).to eq(502)
+        end
       end
 
       it "returns a 400 error when the image_url is blank" do
@@ -211,9 +206,10 @@ RSpec.describe DiscourseAi::AiHelper::AssistantController do
           SiteSetting.provider = SiteSettings::DbProvider.new(SiteSetting)
           setup_s3
           stub_s3_store
+          assign_fake_provider_to(:ai_helper_image_caption_model)
           SiteSetting.secure_uploads = true
           SiteSetting.ai_helper_allowed_groups = Group::AUTO_GROUPS[:trust_level_1]
-          SiteSetting.ai_llava_endpoint = "https://example.com"
+
           Group.find(SiteSetting.ai_helper_allowed_groups_map.first).add(user)
           user.reload
 
@@ -242,14 +238,11 @@ RSpec.describe DiscourseAi::AiHelper::AssistantController do
 
         it "returns a 200 message and caption if user can access the secure upload" do
           group.add(user)
-          post "/discourse-ai/ai-helper/caption_image",
-               params: {
-                 image_url: image_url,
-                 image_url_type: "long_url",
-               }
 
-          expect(response.status).to eq(200)
-          expect(response.parsed_body["caption"]).to eq(caption_with_attrs)
+          request_caption({ image_url: image_url, image_url_type: "long_url" }) do |r|
+            expect(r.status).to eq(200)
+            expect(r.parsed_body["caption"]).to eq(caption_with_attrs)
+          end
         end
 
         context "if the input URL is for a secure upload but not on the secure-uploads path" do
@@ -257,13 +250,11 @@ RSpec.describe DiscourseAi::AiHelper::AssistantController do
 
           it "creates a signed URL properly and makes the caption" do
             group.add(user)
-            post "/discourse-ai/ai-helper/caption_image",
-                 params: {
-                   image_url: image_url,
-                   image_url_type: "long_url",
-                 }
-            expect(response.status).to eq(200)
-            expect(response.parsed_body["caption"]).to eq(caption_with_attrs)
+
+            request_caption({ image_url: image_url, image_url_type: "long_url" }) do |r|
+              expect(r.status).to eq(200)
+              expect(r.parsed_body["caption"]).to eq(caption_with_attrs)
+            end
           end
         end
       end
diff --git a/spec/system/ai_helper/ai_image_caption_spec.rb b/spec/system/ai_helper/ai_image_caption_spec.rb
index c0f18d5c..82dad656 100644
--- a/spec/system/ai_helper/ai_image_caption_spec.rb
+++ b/spec/system/ai_helper/ai_image_caption_spec.rb
@@ -21,14 +21,9 @@ RSpec.describe "AI image caption", type: :system, js: true do
   before do
     Group.find_by(id: Group::AUTO_GROUPS[:admins]).add(user)
     assign_fake_provider_to(:ai_helper_model)
-    SiteSetting.ai_llava_endpoint = "https://example.com"
+    assign_fake_provider_to(:ai_helper_image_caption_model)
     SiteSetting.ai_helper_enabled_features = "image_caption"
     sign_in(user)
-
-    stub_request(:post, "https://example.com/predictions").to_return(
-      status: 200,
-      body: { output: caption.gsub(" ", " |").split("|") }.to_json,
-    )
   end
 
   shared_examples "shows no image caption button" do
@@ -53,35 +48,41 @@ RSpec.describe "AI image caption", type: :system, js: true do
 
   context "when triggering caption with AI on desktop" do
     it "should show an image caption in an input field" do
-      visit("/latest")
-      page.find("#create-topic").click
-      attach_file([file_path]) { composer.click_toolbar_button("upload") }
-      popup.click_generate_caption
-      expect(popup.has_caption_popup_value?(caption_with_attrs)).to eq(true)
-      popup.save_caption
-      wait_for { page.find(".image-wrapper img")["alt"] == caption_with_attrs }
-      expect(page.find(".image-wrapper img")["alt"]).to eq(caption_with_attrs)
+      DiscourseAi::Completions::Llm.with_prepared_responses([caption]) do
+        visit("/latest")
+        page.find("#create-topic").click
+        attach_file([file_path]) { composer.click_toolbar_button("upload") }
+        popup.click_generate_caption
+        expect(popup.has_caption_popup_value?(caption_with_attrs)).to eq(true)
+        popup.save_caption
+        wait_for { page.find(".image-wrapper img")["alt"] == caption_with_attrs }
+        expect(page.find(".image-wrapper img")["alt"]).to eq(caption_with_attrs)
+      end
     end
 
     it "should allow you to cancel a caption request" do
-      visit("/latest")
-      page.find("#create-topic").click
-      attach_file([file_path]) { composer.click_toolbar_button("upload") }
-      popup.click_generate_caption
-      popup.cancel_caption
-      expect(popup).to have_no_disabled_generate_button
+      DiscourseAi::Completions::Llm.with_prepared_responses([caption]) do
+        visit("/latest")
+        page.find("#create-topic").click
+        attach_file([file_path]) { composer.click_toolbar_button("upload") }
+        popup.click_generate_caption
+        popup.cancel_caption
+        expect(popup).to have_no_disabled_generate_button
+      end
     end
   end
 
   context "when triggering caption with AI on mobile", mobile: true do
     it "should show update the image alt text with the caption" do
-      visit("/latest")
-      page.find("#create-topic").click
-      attach_file([file_path]) { page.find(".mobile-file-upload").click }
-      page.find(".mobile-preview").click
-      popup.click_generate_caption
-      wait_for { page.find(".image-wrapper img")["alt"] == caption_with_attrs }
-      expect(page.find(".image-wrapper img")["alt"]).to eq(caption_with_attrs)
+      DiscourseAi::Completions::Llm.with_prepared_responses([caption]) do
+        visit("/latest")
+        page.find("#create-topic").click
+        attach_file([file_path]) { page.find(".mobile-file-upload").click }
+        page.find(".mobile-preview").click
+        popup.click_generate_caption
+        wait_for { page.find(".image-wrapper img")["alt"] == caption_with_attrs }
+        expect(page.find(".image-wrapper img")["alt"]).to eq(caption_with_attrs)
+      end
     end
   end
 
@@ -125,15 +126,17 @@ RSpec.describe "AI image caption", type: :system, js: true do
       end
 
       it "should auto caption the existing images and update the preference when dialog is accepted" do
-        visit("/latest")
-        page.find("#create-topic").click
-        attach_file([file_path]) { composer.click_toolbar_button("upload") }
-        wait_for { composer.has_no_in_progress_uploads? }
-        composer.fill_title("I love using Discourse! It is my favorite forum software")
-        composer.create
-        dialog.click_yes
-        wait_for(timeout: 100) { page.find("#post_1 .cooked img")["alt"] == caption_with_attrs }
-        expect(page.find("#post_1 .cooked img")["alt"]).to eq(caption_with_attrs)
+        DiscourseAi::Completions::Llm.with_prepared_responses([caption]) do
+          visit("/latest")
+          page.find("#create-topic").click
+          attach_file([file_path]) { composer.click_toolbar_button("upload") }
+          wait_for { composer.has_no_in_progress_uploads? }
+          composer.fill_title("I love using Discourse! It is my favorite forum software")
+          composer.create
+          dialog.click_yes
+          wait_for(timeout: 100) { page.find("#post_1 .cooked img")["alt"] == caption_with_attrs }
+          expect(page.find("#post_1 .cooked img")["alt"]).to eq(caption_with_attrs)
+        end
       end
     end
 
@@ -142,14 +145,16 @@ RSpec.describe "AI image caption", type: :system, js: true do
 
       skip "TODO: Fix auto_image_caption user option not present in testing environment?" do
         it "should auto caption the image after uploading" do
-          visit("/latest")
-          page.find("#create-topic").click
-          attach_file([Rails.root.join("spec/fixtures/images/logo.jpg")]) do
-            composer.click_toolbar_button("upload")
+          DiscourseAi::Completions::Llm.with_prepared_responses([caption]) do
+            visit("/latest")
+            page.find("#create-topic").click
+            attach_file([Rails.root.join("spec/fixtures/images/logo.jpg")]) do
+              composer.click_toolbar_button("upload")
+            end
+            wait_for { composer.has_no_in_progress_uploads? }
+            wait_for { page.find(".image-wrapper img")["alt"] == caption_with_attrs }
+            expect(page.find(".image-wrapper img")["alt"]).to eq(caption_with_attrs)
           end
-          wait_for { composer.has_no_in_progress_uploads? }
-          wait_for { page.find(".image-wrapper img")["alt"] == caption_with_attrs }
-          expect(page.find(".image-wrapper img")["alt"]).to eq(caption_with_attrs)
         end
       end
     end