FEATURE: Track if a model can do vision in the llm_models table (#725)

* FEATURE: Track if a model can do vision in the llm_models table

* Data migration
This commit is contained in:
Roman Rizzi 2024-07-24 16:29:47 -03:00 committed by GitHub
parent 06e239321b
commit 5c196bca89
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
25 changed files with 289 additions and 263 deletions

View File

@ -106,6 +106,7 @@ module DiscourseAi
:max_prompt_tokens, :max_prompt_tokens,
:api_key, :api_key,
:enabled_chat_bot, :enabled_chat_bot,
:vision_enabled,
) )
provider = updating ? updating.provider : permitted[:provider] provider = updating ? updating.provider : permitted[:provider]

View File

@ -124,4 +124,6 @@ end
# api_key :string # api_key :string
# user_id :integer # user_id :integer
# enabled_chat_bot :boolean default(FALSE), not null # enabled_chat_bot :boolean default(FALSE), not null
# provider_params :jsonb
# vision_enabled :boolean default(FALSE), not null
# #

View File

@ -13,7 +13,8 @@ class LlmModelSerializer < ApplicationSerializer
:url, :url,
:enabled_chat_bot, :enabled_chat_bot,
:shadowed_by_srv, :shadowed_by_srv,
:provider_params :provider_params,
:vision_enabled
has_one :user, serializer: BasicUserSerializer, embed: :object has_one :user, serializer: BasicUserSerializer, embed: :object

View File

@ -13,7 +13,8 @@ export default class AiLlm extends RestModel {
"url", "url",
"api_key", "api_key",
"enabled_chat_bot", "enabled_chat_bot",
"provider_params" "provider_params",
"vision_enabled"
); );
} }

View File

@ -267,6 +267,14 @@ export default class AiLlmEditorForm extends Component {
@content={{I18n.t "discourse_ai.llms.hints.max_prompt_tokens"}} @content={{I18n.t "discourse_ai.llms.hints.max_prompt_tokens"}}
/> />
</div> </div>
<div class="control-group ai-llm-editor__vision-enabled">
<Input @type="checkbox" @checked={{@model.vision_enabled}} />
<label>{{I18n.t "discourse_ai.llms.vision_enabled"}}</label>
<DTooltip
@icon="question-circle"
@content={{I18n.t "discourse_ai.llms.hints.vision_enabled"}}
/>
</div>
<div class="control-group"> <div class="control-group">
<DToggleSwitch <DToggleSwitch
class="ai-llm-editor__enabled-chat-bot" class="ai-llm-editor__enabled-chat-bot"

View File

@ -41,4 +41,9 @@
display: flex; display: flex;
align-items: center; align-items: center;
} }
&__vision-enabled {
display: flex;
align-items: flex-start;
}
} }

View File

@ -228,6 +228,7 @@ en:
url: "URL of the service hosting the model" url: "URL of the service hosting the model"
api_key: "API Key of the service hosting the model" api_key: "API Key of the service hosting the model"
enabled_chat_bot: "Allow AI Bot" enabled_chat_bot: "Allow AI Bot"
vision_enabled: "Vision enabled"
ai_bot_user: "AI Bot User" ai_bot_user: "AI Bot User"
save: "Save" save: "Save"
edit: "Edit" edit: "Edit"
@ -252,6 +253,7 @@ en:
hints: hints:
max_prompt_tokens: "Max numbers of tokens for the prompt. As a rule of thumb, this should be 50% of the model's context window." max_prompt_tokens: "Max numbers of tokens for the prompt. As a rule of thumb, this should be 50% of the model's context window."
name: "We include this in the API call to specify which model we'll use." name: "We include this in the API call to specify which model we'll use."
vision_enabled: "If enabled, the AI will attempt to understand images. It depends on the model being used supporting vision. Supported by latest models from Anthropic, Google, and OpenAI."
providers: providers:
aws_bedrock: "AWS Bedrock" aws_bedrock: "AWS Bedrock"

View File

@ -189,10 +189,13 @@ discourse_ai:
ai_vllm_api_key: "" ai_vllm_api_key: ""
ai_llava_endpoint: ai_llava_endpoint:
default: "" default: ""
hidden: true
ai_llava_endpoint_srv: ai_llava_endpoint_srv:
default: "" default: ""
hidden: true hidden: true
ai_llava_api_key: "" ai_llava_api_key:
default: ""
hidden: true
ai_strict_token_counting: ai_strict_token_counting:
default: false default: false
hidden: true hidden: true
@ -254,7 +257,7 @@ discourse_ai:
- "context_menu" - "context_menu"
- "image_caption" - "image_caption"
ai_helper_image_caption_model: ai_helper_image_caption_model:
default: "llava" default: ""
type: enum type: enum
enum: "DiscourseAi::Configuration::LlmVisionEnumerator" enum: "DiscourseAi::Configuration::LlmVisionEnumerator"
ai_auto_image_caption_allowed_groups: ai_auto_image_caption_allowed_groups:

View File

@ -0,0 +1,6 @@
# frozen_string_literal: true
# Schema migration: adds a per-model flag so the plugin can track whether an
# LLM configured in llm_models supports vision (image understanding).
class LlmModelVisionEnabled < ActiveRecord::Migration[7.1]
  def change
    # Defaults to false and is NOT NULL, so every existing row is treated as
    # non-vision until the follow-up data migration flips known vision models.
    add_column :llm_models, :vision_enabled, :boolean, default: false, null: false
  end
end

View File

@ -0,0 +1,44 @@
# frozen_string_literal: true
# Data migration that backfills the new llm_models.vision_enabled flag and
# repoints the ai_helper_image_caption_model site setting at the LlmModel row
# (new "custom:<id>" format) instead of the legacy "provider:model" value.
class MigrateVisionLlms < ActiveRecord::Migration[7.1]
  def up
    # Known vision-capable model names at the time of this migration
    # (Anthropic Claude 3 family, OpenAI GPT-4 vision variants, Gemini 1.5).
    vision_models = %w[
      claude-3-sonnet
      claude-3-opus
      claude-3-haiku
      gpt-4-vision-preview
      gpt-4-turbo
      gpt-4o
      gemini-1.5-pro
      gemini-1.5-flash
    ]

    DB.exec(<<~SQL, names: vision_models)
      UPDATE llm_models
      SET vision_enabled = true
      WHERE name IN (:names)
    SQL

    current_value =
      DB.query_single(
        "SELECT value FROM site_settings WHERE name = :setting_name",
        setting_name: "ai_helper_image_caption_model",
      ).first

    # "llava" was the legacy non-LLM backend; it has no llm_models row, so
    # only migrate settings that reference an actual model name.
    if current_value && current_value != "llava"
      llm_model =
        DB.query_single("SELECT id FROM llm_models WHERE name = :model", model: current_value).first

      if llm_model
        # Rewrite the setting to the new "custom:<id>" format understood by
        # DiscourseAi::Configuration::LlmVisionEnumerator.
        DB.exec(<<~SQL, new: "custom:#{llm_model}")
          UPDATE site_settings
          SET value = :new
          WHERE name = 'ai_helper_image_caption_model'
        SQL
      end
    end
  end

  def down
    raise ActiveRecord::IrreversibleMigration
  end
end

View File

@ -143,47 +143,26 @@ module DiscourseAi
end end
def generate_image_caption(upload, user) def generate_image_caption(upload, user)
if SiteSetting.ai_helper_image_caption_model == "llava" prompt =
image_base64 = DiscourseAi::Completions::Prompt.new(
DiscourseAi::Completions::UploadEncoder.encode( "You are a bot specializing in image captioning.",
upload_ids: [upload.id], messages: [
max_pixels: 1_048_576, {
).first[ type: :user,
:base64 content:
] "Describe this image in a single sentence#{custom_locale_instructions(user)}",
parameters = { upload_ids: [upload.id],
input: { },
image: "data:image/#{upload.extension};base64, #{image_base64}", ],
top_p: 1, skip_validations: true,
max_tokens: 1024,
temperature: 0.2,
prompt: "Please describe this image in a single sentence",
},
}
::DiscourseAi::Inference::Llava.perform!(parameters).dig(:output).join
else
prompt =
DiscourseAi::Completions::Prompt.new(
"You are a bot specializing in image captioning.",
messages: [
{
type: :user,
content:
"Describe this image in a single sentence#{custom_locale_instructions(user)}",
upload_ids: [upload.id],
},
],
skip_validations: true,
)
DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_helper_image_caption_model).generate(
prompt,
user: user,
max_tokens: 1024,
feature_name: "image_caption",
) )
end
DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_helper_image_caption_model).generate(
prompt,
user: user,
max_tokens: 1024,
feature_name: "image_caption",
)
end end
private private

View File

@ -78,33 +78,25 @@ module DiscourseAi
end end
end end
user_message[:content] = inline_images(user_message[:content], msg) user_message[:content] = inline_images(user_message[:content], msg) if vision_support?
user_message user_message
end end
def inline_images(content, message) def inline_images(content, message)
if model_name.include?("gpt-4-vision") || model_name == "gpt-4-turbo" || encoded_uploads = prompt.encoded_uploads(message)
model_name == "gpt-4o" return content if encoded_uploads.blank?
content = message[:content]
encoded_uploads = prompt.encoded_uploads(message)
if encoded_uploads.present?
new_content = []
new_content.concat(
encoded_uploads.map do |details|
{
type: "image_url",
image_url: {
url: "data:#{details[:mime_type]};base64,#{details[:base64]}",
},
}
end,
)
new_content << { type: "text", text: content }
content = new_content
end
end
content content_w_imgs =
encoded_uploads.reduce([]) do |memo, details|
memo << {
type: "image_url",
image_url: {
url: "data:#{details[:mime_type]};base64,#{details[:base64]}",
},
}
end
content_w_imgs << { type: "text", text: message[:content] }
end end
def per_message_overhead def per_message_overhead

View File

@ -109,34 +109,28 @@ module DiscourseAi
content = +"" content = +""
content << "#{msg[:id]}: " if msg[:id] content << "#{msg[:id]}: " if msg[:id]
content << msg[:content] content << msg[:content]
content = inline_images(content, msg) content = inline_images(content, msg) if vision_support?
{ role: "user", content: content } { role: "user", content: content }
end end
def inline_images(content, message) def inline_images(content, message)
if model_name.include?("claude-3") encoded_uploads = prompt.encoded_uploads(message)
encoded_uploads = prompt.encoded_uploads(message) return content if encoded_uploads.blank?
if encoded_uploads.present?
new_content = []
new_content.concat(
encoded_uploads.map do |details|
{
source: {
type: "base64",
data: details[:base64],
media_type: details[:mime_type],
},
type: "image",
}
end,
)
new_content << { type: "text", text: content }
content = new_content
end
end
content content_w_imgs =
encoded_uploads.reduce([]) do |memo, details|
memo << {
source: {
type: "base64",
data: details[:base64],
media_type: details[:mime_type],
},
type: "image",
}
end
content_w_imgs << { type: "text", text: content }
end end
end end
end end

View File

@ -56,6 +56,10 @@ module DiscourseAi
false false
end end
def vision_support?
llm_model&.vision_enabled?
end
def tools def tools
@tools ||= tools_dialect.translated_tools @tools ||= tools_dialect.translated_tools
end end

View File

@ -114,6 +114,8 @@ module DiscourseAi
if beta_api? if beta_api?
# support new format with multiple parts # support new format with multiple parts
result = { role: "user", parts: [{ text: msg[:content] }] } result = { role: "user", parts: [{ text: msg[:content] }] }
return result unless vision_support?
upload_parts = uploaded_parts(msg) upload_parts = uploaded_parts(msg)
result[:parts].concat(upload_parts) if upload_parts.present? result[:parts].concat(upload_parts) if upload_parts.present?
result result

View File

@ -47,7 +47,28 @@ module DiscourseAi
content << "#{msg[:id]}: " if msg[:id] content << "#{msg[:id]}: " if msg[:id]
content << msg[:content] content << msg[:content]
{ role: "user", content: content } message = { role: "user", content: content }
message[:content] = inline_images(message[:content], msg) if vision_support?
message
end
def inline_images(content, message)
encoded_uploads = prompt.encoded_uploads(message)
return content if encoded_uploads.blank?
content_w_imgs =
encoded_uploads.reduce([]) do |memo, details|
memo << {
type: "image_url",
image_url: {
url: "data:#{details[:mime_type]};base64,#{details[:base64]}",
},
}
end
content_w_imgs << { type: "text", text: message[:content] }
end end
end end
end end

View File

@ -35,6 +35,8 @@ module DiscourseAi
"The number of completions you requested exceed the number of canned responses" "The number of completions you requested exceed the number of canned responses"
end end
raise response if response.is_a?(StandardError)
@completions += 1 @completions += 1
if block_given? if block_given?
cancelled = false cancelled = false

View File

@ -89,15 +89,6 @@ module DiscourseAi
DiscourseAi::Tokenizer::BasicTokenizer.available_llm_tokenizers.map(&:name) DiscourseAi::Tokenizer::BasicTokenizer.available_llm_tokenizers.map(&:name)
end end
def vision_models_by_provider
@vision_models_by_provider ||= {
aws_bedrock: %w[claude-3-sonnet claude-3-opus claude-3-haiku],
anthropic: %w[claude-3-sonnet claude-3-opus claude-3-haiku],
open_ai: %w[gpt-4-vision-preview gpt-4-turbo gpt-4o],
google: %w[gemini-1.5-pro gemini-1.5-flash],
}
end
def models_by_provider def models_by_provider
# ChatGPT models are listed under open_ai but they are actually available through OpenAI and Azure. # ChatGPT models are listed under open_ai but they are actually available through OpenAI and Azure.
# However, since they use the same URL/key settings, there's no reason to duplicate them. # However, since they use the same URL/key settings, there's no reason to duplicate them.

View File

@ -10,24 +10,15 @@ module DiscourseAi
end end
def self.values def self.values
begin values = DB.query_hash(<<~SQL).map(&:symbolize_keys)
result = SELECT display_name AS name, id AS value
DiscourseAi::Completions::Llm.vision_models_by_provider.flat_map do |provider, models| FROM llm_models
endpoint = DiscourseAi::Completions::Endpoints::Base.endpoint_for(provider.to_s) WHERE vision_enabled
SQL
models.map do |model_name| values.each { |value_h| value_h[:value] = "custom:#{value_h[:value]}" }
{ name: endpoint.display_name(model_name), value: "#{provider}:#{model_name}" }
end
end
result << { name: "Llava", value: "llava" } values
result
# TODO add support for LlmModel as well
# LlmModel.all.each do |model|
# llm_models << { name: model.display_name, value: "custom:#{model.id}" }
# end
end
end end
end end
end end

View File

@ -1,31 +0,0 @@
# frozen_string_literal: true
module ::DiscourseAi
  module Inference
    # Thin client for the (legacy) self-hosted LLaVA captioning service.
    # The endpoint is resolved from site settings, either directly or via a
    # DNS SRV lookup when ai_llava_endpoint_srv is configured.
    class Llava
      # POSTs +content+ (serialized to JSON) to the service's /predictions
      # route and returns the parsed, symbolized response body.
      # Raises Net::HTTPBadResponse on any non-200 status.
      def self.perform!(content)
        headers = { "Referer" => Discourse.base_url, "Content-Type" => "application/json" }
        headers["X-API-KEY"] = SiteSetting.ai_llava_api_key if SiteSetting.ai_llava_api_key.present?

        response = Faraday.post("#{resolved_endpoint}/predictions", content.to_json, headers)
        raise Net::HTTPBadResponse unless response.status == 200

        JSON.parse(response.body, symbolize_names: true)
      end

      # True when either endpoint setting is filled in.
      def self.configured?
        SiteSetting.ai_llava_endpoint.present? || SiteSetting.ai_llava_endpoint_srv.present?
      end

      # Prefers SRV-based discovery over the plain endpoint setting.
      def self.resolved_endpoint
        return SiteSetting.ai_llava_endpoint if SiteSetting.ai_llava_endpoint_srv.blank?

        service = DiscourseAi::Utils::DnsSrv.lookup(SiteSetting.ai_llava_endpoint_srv)
        "https://#{service.target}:#{service.port}"
      end
      private_class_method :resolved_endpoint
    end
  end
end

View File

@ -2,7 +2,18 @@
require_relative "endpoint_compliance" require_relative "endpoint_compliance"
RSpec.describe DiscourseAi::Completions::Endpoints::Anthropic do RSpec.describe DiscourseAi::Completions::Endpoints::Anthropic do
let(:llm) { DiscourseAi::Completions::Llm.proxy("anthropic:claude-3-opus") } let(:url) { "https://api.anthropic.com/v1/messages" }
fab!(:model) do
Fabricate(
:llm_model,
url: "https://api.anthropic.com/v1/messages",
name: "claude-3-opus",
provider: "anthropic",
api_key: "123",
vision_enabled: true,
)
end
let(:llm) { DiscourseAi::Completions::Llm.proxy("custom:#{model.id}") }
let(:image100x100) { plugin_file_from_fixtures("100x100.jpg") } let(:image100x100) { plugin_file_from_fixtures("100x100.jpg") }
let(:upload100x100) do let(:upload100x100) do
UploadCreator.new(image100x100, "image.jpg").create_for(Discourse.system_user.id) UploadCreator.new(image100x100, "image.jpg").create_for(Discourse.system_user.id)
@ -45,8 +56,6 @@ RSpec.describe DiscourseAi::Completions::Endpoints::Anthropic do
prompt_with_tools prompt_with_tools
end end
before { SiteSetting.ai_anthropic_api_key = "123" }
it "does not eat spaces with tool calls" do it "does not eat spaces with tool calls" do
SiteSetting.ai_anthropic_native_tool_call_models = "claude-3-opus" SiteSetting.ai_anthropic_native_tool_call_models = "claude-3-opus"
body = <<~STRING body = <<~STRING
@ -108,10 +117,7 @@ RSpec.describe DiscourseAi::Completions::Endpoints::Anthropic do
result = +"" result = +""
body = body.scan(/.*\n/) body = body.scan(/.*\n/)
EndpointMock.with_chunk_array_support do EndpointMock.with_chunk_array_support do
stub_request(:post, "https://api.anthropic.com/v1/messages").to_return( stub_request(:post, url).to_return(status: 200, body: body)
status: 200,
body: body,
)
llm.generate(prompt_with_google_tool, user: Discourse.system_user) do |partial| llm.generate(prompt_with_google_tool, user: Discourse.system_user) do |partial|
result << partial result << partial
@ -161,7 +167,7 @@ RSpec.describe DiscourseAi::Completions::Endpoints::Anthropic do
parsed_body = nil parsed_body = nil
stub_request(:post, "https://api.anthropic.com/v1/messages").with( stub_request(:post, url).with(
body: body:
proc do |req_body| proc do |req_body|
parsed_body = JSON.parse(req_body, symbolize_names: true) parsed_body = JSON.parse(req_body, symbolize_names: true)
@ -244,7 +250,7 @@ RSpec.describe DiscourseAi::Completions::Endpoints::Anthropic do
}, },
}.to_json }.to_json
stub_request(:post, "https://api.anthropic.com/v1/messages").to_return(body: body) stub_request(:post, url).to_return(body: body)
result = proxy.generate(prompt, user: Discourse.system_user) result = proxy.generate(prompt, user: Discourse.system_user)
@ -314,7 +320,7 @@ RSpec.describe DiscourseAi::Completions::Endpoints::Anthropic do
STRING STRING
requested_body = nil requested_body = nil
stub_request(:post, "https://api.anthropic.com/v1/messages").with( stub_request(:post, url).with(
body: body:
proc do |req_body| proc do |req_body|
requested_body = JSON.parse(req_body, symbolize_names: true) requested_body = JSON.parse(req_body, symbolize_names: true)
@ -351,7 +357,7 @@ RSpec.describe DiscourseAi::Completions::Endpoints::Anthropic do
STRING STRING
parsed_body = nil parsed_body = nil
stub_request(:post, "https://api.anthropic.com/v1/messages").with( stub_request(:post, url).with(
body: body:
proc do |req_body| proc do |req_body|
parsed_body = JSON.parse(req_body, symbolize_names: true) parsed_body = JSON.parse(req_body, symbolize_names: true)

View File

@ -130,6 +130,17 @@ end
RSpec.describe DiscourseAi::Completions::Endpoints::Gemini do RSpec.describe DiscourseAi::Completions::Endpoints::Gemini do
subject(:endpoint) { described_class.new("gemini-pro", DiscourseAi::Tokenizer::OpenAiTokenizer) } subject(:endpoint) { described_class.new("gemini-pro", DiscourseAi::Tokenizer::OpenAiTokenizer) }
fab!(:model) do
Fabricate(
:llm_model,
url: "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest",
name: "gemini-1.5-pro",
provider: "google",
api_key: "ABC",
vision_enabled: true,
)
end
fab!(:user) fab!(:user)
let(:image100x100) { plugin_file_from_fixtures("100x100.jpg") } let(:image100x100) { plugin_file_from_fixtures("100x100.jpg") }
@ -144,8 +155,6 @@ RSpec.describe DiscourseAi::Completions::Endpoints::Gemini do
end end
it "Supports Vision API" do it "Supports Vision API" do
SiteSetting.ai_gemini_api_key = "ABC"
prompt = prompt =
DiscourseAi::Completions::Prompt.new( DiscourseAi::Completions::Prompt.new(
"You are image bot", "You are image bot",
@ -158,9 +167,8 @@ RSpec.describe DiscourseAi::Completions::Endpoints::Gemini do
req_body = nil req_body = nil
llm = DiscourseAi::Completions::Llm.proxy("google:gemini-1.5-pro") llm = DiscourseAi::Completions::Llm.proxy("custom:#{model.id}")
url = url = "#{model.url}:generateContent?key=ABC"
"https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:generateContent?key=ABC"
stub_request(:post, url).with( stub_request(:post, url).with(
body: body:
@ -202,8 +210,6 @@ RSpec.describe DiscourseAi::Completions::Endpoints::Gemini do
end end
it "Can correctly handle streamed responses even if they are chunked badly" do it "Can correctly handle streamed responses even if they are chunked badly" do
SiteSetting.ai_gemini_api_key = "ABC"
data = +"" data = +""
data << "da|ta: |" data << "da|ta: |"
data << gemini_mock.response("Hello").to_json data << gemini_mock.response("Hello").to_json
@ -214,9 +220,8 @@ RSpec.describe DiscourseAi::Completions::Endpoints::Gemini do
split = data.split("|") split = data.split("|")
llm = DiscourseAi::Completions::Llm.proxy("google:gemini-1.5-flash") llm = DiscourseAi::Completions::Llm.proxy("custom:#{model.id}")
url = url = "#{model.url}:streamGenerateContent?alt=sse&key=ABC"
"https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest:streamGenerateContent?alt=sse&key=ABC"
output = +"" output = +""
gemini_mock.with_chunk_array_support do gemini_mock.with_chunk_array_support do

View File

@ -258,7 +258,8 @@ RSpec.describe DiscourseAi::Completions::Endpoints::OpenAi do
describe "image support" do describe "image support" do
it "can handle images" do it "can handle images" do
llm = DiscourseAi::Completions::Llm.proxy("open_ai:gpt-4-turbo") model = Fabricate(:llm_model, provider: "open_ai", vision_enabled: true)
llm = DiscourseAi::Completions::Llm.proxy("custom:#{model.id}")
prompt = prompt =
DiscourseAi::Completions::Prompt.new( DiscourseAi::Completions::Prompt.new(
"You are image bot", "You are image bot",

View File

@ -112,43 +112,40 @@ RSpec.describe DiscourseAi::AiHelper::AssistantController do
"A picture of a cat sitting on a table (#{I18n.t("discourse_ai.ai_helper.image_caption.attribution")})" "A picture of a cat sitting on a table (#{I18n.t("discourse_ai.ai_helper.image_caption.attribution")})"
end end
before { assign_fake_provider_to(:ai_helper_image_caption_model) }
def request_caption(params)
DiscourseAi::Completions::Llm.with_prepared_responses([caption]) do
post "/discourse-ai/ai-helper/caption_image", params: params
yield(response)
end
end
context "when logged in as an allowed user" do context "when logged in as an allowed user" do
fab!(:user) { Fabricate(:user, refresh_auto_groups: true) } fab!(:user) { Fabricate(:user, refresh_auto_groups: true) }
before do before do
sign_in(user) sign_in(user)
SiteSetting.ai_helper_allowed_groups = Group::AUTO_GROUPS[:trust_level_1]
SiteSetting.ai_llava_endpoint = "https://example.com"
stub_request(:post, "https://example.com/predictions").to_return( SiteSetting.ai_helper_allowed_groups = Group::AUTO_GROUPS[:trust_level_1]
status: 200,
body: { output: caption.gsub(" ", " |").split("|") }.to_json,
)
end end
it "returns the suggested caption for the image" do it "returns the suggested caption for the image" do
post "/discourse-ai/ai-helper/caption_image", request_caption({ image_url: image_url, image_url_type: "long_url" }) do |r|
params: { expect(r.status).to eq(200)
image_url: image_url, expect(r.parsed_body["caption"]).to eq(caption_with_attrs)
image_url_type: "long_url", end
}
expect(response.status).to eq(200)
expect(response.parsed_body["caption"]).to eq(caption_with_attrs)
end end
context "when the image_url is a short_url" do context "when the image_url is a short_url" do
let(:image_url) { upload.short_url } let(:image_url) { upload.short_url }
it "returns the suggested caption for the image" do it "returns the suggested caption for the image" do
post "/discourse-ai/ai-helper/caption_image", request_caption({ image_url: image_url, image_url_type: "short_url" }) do |r|
params: { expect(r.status).to eq(200)
image_url: image_url, expect(r.parsed_body["caption"]).to eq(caption_with_attrs)
image_url_type: "short_url", end
}
expect(response.status).to eq(200)
expect(response.parsed_body["caption"]).to eq(caption_with_attrs)
end end
end end
@ -156,27 +153,25 @@ RSpec.describe DiscourseAi::AiHelper::AssistantController do
let(:image_url) { "#{Discourse.base_url}#{upload.short_path}" } let(:image_url) { "#{Discourse.base_url}#{upload.short_path}" }
it "returns the suggested caption for the image" do it "returns the suggested caption for the image" do
post "/discourse-ai/ai-helper/caption_image", request_caption({ image_url: image_url, image_url_type: "short_path" }) do |r|
params: { expect(r.status).to eq(200)
image_url: image_url, expect(r.parsed_body["caption"]).to eq(caption_with_attrs)
image_url_type: "short_path", end
}
expect(response.status).to eq(200)
expect(response.parsed_body["caption"]).to eq(caption_with_attrs)
end end
end end
it "returns a 502 error when the completion call fails" do it "returns a 502 error when the completion call fails" do
stub_request(:post, "https://example.com/predictions").to_return(status: 502) DiscourseAi::Completions::Llm.with_prepared_responses(
[DiscourseAi::Completions::Endpoints::Base::CompletionFailed.new],
) do
post "/discourse-ai/ai-helper/caption_image",
params: {
image_url: image_url,
image_url_type: "long_url",
}
post "/discourse-ai/ai-helper/caption_image", expect(response.status).to eq(502)
params: { end
image_url: image_url,
image_url_type: "long_url",
}
expect(response.status).to eq(502)
end end
it "returns a 400 error when the image_url is blank" do it "returns a 400 error when the image_url is blank" do
@ -211,9 +206,10 @@ RSpec.describe DiscourseAi::AiHelper::AssistantController do
SiteSetting.provider = SiteSettings::DbProvider.new(SiteSetting) SiteSetting.provider = SiteSettings::DbProvider.new(SiteSetting)
setup_s3 setup_s3
stub_s3_store stub_s3_store
assign_fake_provider_to(:ai_helper_image_caption_model)
SiteSetting.secure_uploads = true SiteSetting.secure_uploads = true
SiteSetting.ai_helper_allowed_groups = Group::AUTO_GROUPS[:trust_level_1] SiteSetting.ai_helper_allowed_groups = Group::AUTO_GROUPS[:trust_level_1]
SiteSetting.ai_llava_endpoint = "https://example.com"
Group.find(SiteSetting.ai_helper_allowed_groups_map.first).add(user) Group.find(SiteSetting.ai_helper_allowed_groups_map.first).add(user)
user.reload user.reload
@ -242,14 +238,11 @@ RSpec.describe DiscourseAi::AiHelper::AssistantController do
it "returns a 200 message and caption if user can access the secure upload" do it "returns a 200 message and caption if user can access the secure upload" do
group.add(user) group.add(user)
post "/discourse-ai/ai-helper/caption_image",
params: {
image_url: image_url,
image_url_type: "long_url",
}
expect(response.status).to eq(200) request_caption({ image_url: image_url, image_url_type: "long_url" }) do |r|
expect(response.parsed_body["caption"]).to eq(caption_with_attrs) expect(r.status).to eq(200)
expect(r.parsed_body["caption"]).to eq(caption_with_attrs)
end
end end
context "if the input URL is for a secure upload but not on the secure-uploads path" do context "if the input URL is for a secure upload but not on the secure-uploads path" do
@ -257,13 +250,11 @@ RSpec.describe DiscourseAi::AiHelper::AssistantController do
it "creates a signed URL properly and makes the caption" do it "creates a signed URL properly and makes the caption" do
group.add(user) group.add(user)
post "/discourse-ai/ai-helper/caption_image",
params: { request_caption({ image_url: image_url, image_url_type: "long_url" }) do |r|
image_url: image_url, expect(r.status).to eq(200)
image_url_type: "long_url", expect(r.parsed_body["caption"]).to eq(caption_with_attrs)
} end
expect(response.status).to eq(200)
expect(response.parsed_body["caption"]).to eq(caption_with_attrs)
end end
end end
end end

View File

@ -21,14 +21,9 @@ RSpec.describe "AI image caption", type: :system, js: true do
before do before do
Group.find_by(id: Group::AUTO_GROUPS[:admins]).add(user) Group.find_by(id: Group::AUTO_GROUPS[:admins]).add(user)
assign_fake_provider_to(:ai_helper_model) assign_fake_provider_to(:ai_helper_model)
SiteSetting.ai_llava_endpoint = "https://example.com" assign_fake_provider_to(:ai_helper_image_caption_model)
SiteSetting.ai_helper_enabled_features = "image_caption" SiteSetting.ai_helper_enabled_features = "image_caption"
sign_in(user) sign_in(user)
stub_request(:post, "https://example.com/predictions").to_return(
status: 200,
body: { output: caption.gsub(" ", " |").split("|") }.to_json,
)
end end
shared_examples "shows no image caption button" do shared_examples "shows no image caption button" do
@ -53,35 +48,41 @@ RSpec.describe "AI image caption", type: :system, js: true do
context "when triggering caption with AI on desktop" do context "when triggering caption with AI on desktop" do
it "should show an image caption in an input field" do it "should show an image caption in an input field" do
visit("/latest") DiscourseAi::Completions::Llm.with_prepared_responses([caption]) do
page.find("#create-topic").click visit("/latest")
attach_file([file_path]) { composer.click_toolbar_button("upload") } page.find("#create-topic").click
popup.click_generate_caption attach_file([file_path]) { composer.click_toolbar_button("upload") }
expect(popup.has_caption_popup_value?(caption_with_attrs)).to eq(true) popup.click_generate_caption
popup.save_caption expect(popup.has_caption_popup_value?(caption_with_attrs)).to eq(true)
wait_for { page.find(".image-wrapper img")["alt"] == caption_with_attrs } popup.save_caption
expect(page.find(".image-wrapper img")["alt"]).to eq(caption_with_attrs) wait_for { page.find(".image-wrapper img")["alt"] == caption_with_attrs }
expect(page.find(".image-wrapper img")["alt"]).to eq(caption_with_attrs)
end
end end
it "should allow you to cancel a caption request" do it "should allow you to cancel a caption request" do
visit("/latest") DiscourseAi::Completions::Llm.with_prepared_responses([caption]) do
page.find("#create-topic").click visit("/latest")
attach_file([file_path]) { composer.click_toolbar_button("upload") } page.find("#create-topic").click
popup.click_generate_caption attach_file([file_path]) { composer.click_toolbar_button("upload") }
popup.cancel_caption popup.click_generate_caption
expect(popup).to have_no_disabled_generate_button popup.cancel_caption
expect(popup).to have_no_disabled_generate_button
end
end end
end end
context "when triggering caption with AI on mobile", mobile: true do context "when triggering caption with AI on mobile", mobile: true do
it "should show update the image alt text with the caption" do it "should show update the image alt text with the caption" do
visit("/latest") DiscourseAi::Completions::Llm.with_prepared_responses([caption]) do
page.find("#create-topic").click visit("/latest")
attach_file([file_path]) { page.find(".mobile-file-upload").click } page.find("#create-topic").click
page.find(".mobile-preview").click attach_file([file_path]) { page.find(".mobile-file-upload").click }
popup.click_generate_caption page.find(".mobile-preview").click
wait_for { page.find(".image-wrapper img")["alt"] == caption_with_attrs } popup.click_generate_caption
expect(page.find(".image-wrapper img")["alt"]).to eq(caption_with_attrs) wait_for { page.find(".image-wrapper img")["alt"] == caption_with_attrs }
expect(page.find(".image-wrapper img")["alt"]).to eq(caption_with_attrs)
end
end end
end end
@ -125,15 +126,17 @@ RSpec.describe "AI image caption", type: :system, js: true do
end end
it "should auto caption the existing images and update the preference when dialog is accepted" do it "should auto caption the existing images and update the preference when dialog is accepted" do
visit("/latest") DiscourseAi::Completions::Llm.with_prepared_responses([caption]) do
page.find("#create-topic").click visit("/latest")
attach_file([file_path]) { composer.click_toolbar_button("upload") } page.find("#create-topic").click
wait_for { composer.has_no_in_progress_uploads? } attach_file([file_path]) { composer.click_toolbar_button("upload") }
composer.fill_title("I love using Discourse! It is my favorite forum software") wait_for { composer.has_no_in_progress_uploads? }
composer.create composer.fill_title("I love using Discourse! It is my favorite forum software")
dialog.click_yes composer.create
wait_for(timeout: 100) { page.find("#post_1 .cooked img")["alt"] == caption_with_attrs } dialog.click_yes
expect(page.find("#post_1 .cooked img")["alt"]).to eq(caption_with_attrs) wait_for(timeout: 100) { page.find("#post_1 .cooked img")["alt"] == caption_with_attrs }
expect(page.find("#post_1 .cooked img")["alt"]).to eq(caption_with_attrs)
end
end end
end end
@ -142,14 +145,16 @@ RSpec.describe "AI image caption", type: :system, js: true do
skip "TODO: Fix auto_image_caption user option not present in testing environment?" do skip "TODO: Fix auto_image_caption user option not present in testing environment?" do
it "should auto caption the image after uploading" do it "should auto caption the image after uploading" do
visit("/latest") DiscourseAi::Completions::Llm.with_prepared_responses([caption]) do
page.find("#create-topic").click visit("/latest")
attach_file([Rails.root.join("spec/fixtures/images/logo.jpg")]) do page.find("#create-topic").click
composer.click_toolbar_button("upload") attach_file([Rails.root.join("spec/fixtures/images/logo.jpg")]) do
composer.click_toolbar_button("upload")
end
wait_for { composer.has_no_in_progress_uploads? }
wait_for { page.find(".image-wrapper img")["alt"] == caption_with_attrs }
expect(page.find(".image-wrapper img")["alt"]).to eq(caption_with_attrs)
end end
wait_for { composer.has_no_in_progress_uploads? }
wait_for { page.find(".image-wrapper img")["alt"] == caption_with_attrs }
expect(page.find(".image-wrapper img")["alt"]).to eq(caption_with_attrs)
end end
end end
end end