FEATURE: add support for all vision models (#646)
Previously only GPT-4-vision was supported; this change introduces support for Google and Anthropic models, as well as the newer OpenAI models. It also makes vision work properly in dev environments, because we now send the encoded payload via the prompt instead of sending URLs.
This commit is contained in:
parent dd4e305ff7, commit b487de933d
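
The dev-environment fix deserves a note: an external model provider cannot fetch upload URLs from a local instance (e.g. http://localhost:3000), so instead of sending a URL, the prompt now carries the upload itself (via upload_ids in the hunks below) and the image bytes get encoded into the request payload. A minimal sketch of the idea, assuming a locally stored upload; encode_image_payload is a hypothetical helper, the real encoding lives in the per-provider completion dialects:

require "base64"

# Sketch only: inline the image bytes instead of handing the provider a URL
# it may not be able to reach (private uploads, localhost dev instances).
def encode_image_payload(upload)
  path = Discourse.store.path_for(upload) # local store; nil for remote stores
  {
    type: "image",
    mime_type: "image/png", # assumed here; derive it from the upload in real code
    data: Base64.strict_encode64(File.binread(path)),
  }
end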
@@ -107,21 +107,6 @@ module DiscourseAi
               status: 502
       end

-      def random_caption
-        captions = [
-          "A beautiful landscape",
-          "An adorable puppy",
-          "A delicious meal",
-          "A cozy fireplace",
-          "A stunning sunset",
-          "A charming cityscape",
-          "A peaceful garden",
-          "A majestic mountain range",
-          "A captivating work of art",
-        ]
-        captions.sample
-      end
-
       def caption_image
         image_url = params[:image_url]
         image_url_type = params[:image_url_type]
@@ -138,19 +123,12 @@ module DiscourseAi
         end

         raise Discourse::NotFound if image.blank?
-        final_image_url = get_caption_url(image, image_url)

+        check_secure_upload_permission(image) if image.secure?
+        user = current_user
+
         hijack do
-          if Rails.env.development?
-            sleep 2 # Simulate a delay of 2 seconds
-            caption = random_caption
-          else
-            caption =
-              DiscourseAi::AiHelper::Assistant.new.generate_image_caption(
-                final_image_url,
-                current_user,
-              )
-          end
+          caption = DiscourseAi::AiHelper::Assistant.new.generate_image_caption(image, user)
           render json: {
                    caption:
                      "#{caption} (#{I18n.t("discourse_ai.ai_helper.image_caption.attribution")})",
@@ -181,15 +159,6 @@ module DiscourseAi
           raise Discourse::InvalidAccess
         end
       end
-
-      def get_caption_url(image_upload, image_url)
-        if image_upload.secure?
-          check_secure_upload_permission(image_upload)
-          return Discourse.store.url_for(image_upload)
-        end
-
-        UrlHelper.absolute(image_url)
-      end
     end
   end
 end
@@ -246,9 +246,7 @@ discourse_ai:
   ai_helper_image_caption_model:
     default: "llava"
     type: enum
-    choices:
-      - "llava"
-      - "open_ai:gpt-4-vision-preview"
+    enum: "DiscourseAi::Configuration::LlmVisionEnumerator"
   ai_auto_image_caption_allowed_groups:
     client: true
     type: group_list
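
Note that the values produced by DiscourseAi::Configuration::LlmVisionEnumerator (added later in this commit) take the form "provider:model", with bare "llava" as the one exception, so code reading this setting can split on the first colon to recover both halves. A minimal sketch of that parsing, not the plugin's exact code:

# Illustrative only: unpacking a "provider:model" setting value.
setting = "open_ai:gpt-4-vision-preview"
provider, model = setting.split(":", 2)
provider # => "open_ai"
model    # => "gpt-4-vision-preview"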
@@ -128,8 +128,10 @@ module DiscourseAi
         end
       end

-      def generate_image_caption(image_url, user)
+      def generate_image_caption(upload, user)
         if SiteSetting.ai_helper_image_caption_model == "llava"
+          image_url =
+            upload.secure? ? Discourse.store.url_for(upload) : UrlHelper.absolute(upload.url)
           parameters = {
             input: {
               image: image_url,
@@ -144,17 +146,13 @@ module DiscourseAi
       else
         prompt =
           DiscourseAi::Completions::Prompt.new(
             "You are a bot specializing in image captioning.",
             messages: [
               {
                 type: :user,
-                content: [
-                  {
-                    type: "text",
-                    text:
-                      "Describe this image in a single sentence#{custom_locale_instructions(user)}",
-                  },
-                  { type: "image_url", image_url: image_url },
-                ],
+                content:
+                  "Describe this image in a single sentence#{custom_locale_instructions(user)}",
+                upload_ids: [upload.id],
               },
             ],
             skip_validations: true,
@@ -162,7 +160,7 @@ module DiscourseAi

         DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_helper_image_caption_model).generate(
           prompt,
-          user: Discourse.system_user,
+          user: user,
           max_tokens: 1024,
           feature_name: "image_caption",
         )
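
The upshot of the Assistant hunks: callers now pass the Upload record plus the end user, and generation is attributed to that user rather than Discourse.system_user. A usage sketch, where image is assumed to be an Upload resolved from the request as in the controller hunk above:

# Sketch of the new call shape introduced by this commit.
caption = DiscourseAi::AiHelper::Assistant.new.generate_image_caption(image, current_user)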
@@ -28,6 +28,15 @@ module DiscourseAi
       DiscourseAi::Tokenizer::BasicTokenizer.available_llm_tokenizers.map(&:name)
     end

+    def vision_models_by_provider
+      @vision_models_by_provider ||= {
+        aws_bedrock: %w[claude-3-sonnet claude-3-opus claude-3-haiku],
+        anthropic: %w[claude-3-sonnet claude-3-opus claude-3-haiku],
+        open_ai: %w[gpt-4-vision-preview gpt-4-turbo gpt-4o],
+        google: %w[gemini-1.5-pro gemini-1.5-flash],
+      }
+    end
+
     def models_by_provider
       # ChatGPT models are listed under open_ai but they are actually available through OpenAI and Azure.
       # However, since they use the same URL/key settings, there's no reason to duplicate them.
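
Since the registry's keys are symbols while setting values are strings, a vision-capability check against it might look like the following sketch; vision_capable? is a hypothetical helper, not part of this commit:

# Sketch: checking vision support for a "provider:model" setting value.
def vision_capable?(setting_value)
  return true if setting_value == "llava"

  provider, model = setting_value.split(":", 2)
  models = DiscourseAi::Completions::Llm.vision_models_by_provider[provider&.to_sym]
  models&.include?(model) || false
end

vision_capable?("open_ai:gpt-4o")        # => true
vision_capable?("open_ai:gpt-3.5-turbo") # => false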
@@ -0,0 +1,34 @@
+# frozen_string_literal: true
+
+require "enum_site_setting"
+
+module DiscourseAi
+  module Configuration
+    class LlmVisionEnumerator < ::EnumSiteSetting
+      def self.valid_value?(val)
+        true
+      end
+
+      def self.values
+        begin
+          result =
+            DiscourseAi::Completions::Llm.vision_models_by_provider.flat_map do |provider, models|
+              endpoint = DiscourseAi::Completions::Endpoints::Base.endpoint_for(provider.to_s)
+
+              models.map do |model_name|
+                { name: endpoint.display_name(model_name), value: "#{provider}:#{model_name}" }
+              end
+            end
+
+          result << { name: "Llava", value: "llava" }
+
+          result
+          # TODO add support for LlmModel as well
+          # LlmModel.all.each do |model|
+          #   llm_models << { name: model.display_name, value: "custom:#{model.id}" }
+          # end
+        end
+      end
+    end
+  end
+end
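
For reference, EnumSiteSetting subclasses feed the admin setting UI through values; the output here is roughly the following (the display names are illustrative, the real ones come from each endpoint's display_name):

# Sketch of the enumerator output consumed by the enum site setting.
DiscourseAi::Configuration::LlmVisionEnumerator.values
# => [
#      { name: "claude-3-sonnet", value: "anthropic:claude-3-sonnet" },
#      { name: "gpt-4-vision-preview", value: "open_ai:gpt-4-vision-preview" },
#      ...
#      { name: "Llava", value: "llava" },
#    ]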
@@ -87,11 +87,6 @@ RSpec.describe DiscourseAi::AiHelper::AssistantController do
       expected_diff =
         "<div class=\"inline-diff\"><p><ins>Un </ins><ins>usuario </ins><ins>escribio </ins><ins>esto</ins><del>A </del><del>user </del><del>wrote </del><del>this</del></p></div>"

-      expected_input = <<~TEXT.strip
-        <input>Translate to Spanish:
-        A user wrote this</input>
-      TEXT
-
       DiscourseAi::Completions::Llm.with_prepared_responses([translated_text]) do
         post "/discourse-ai/ai-helper/suggest",
              params: {