diff --git a/app/controllers/discourse_ai/ai_helper/assistant_controller.rb b/app/controllers/discourse_ai/ai_helper/assistant_controller.rb index c8bbd2a9..43eba6d1 100644 --- a/app/controllers/discourse_ai/ai_helper/assistant_controller.rb +++ b/app/controllers/discourse_ai/ai_helper/assistant_controller.rb @@ -107,21 +107,6 @@ module DiscourseAi status: 502 end - def random_caption - captions = [ - "A beautiful landscape", - "An adorable puppy", - "A delicious meal", - "A cozy fireplace", - "A stunning sunset", - "A charming cityscape", - "A peaceful garden", - "A majestic mountain range", - "A captivating work of art", - ] - captions.sample - end - def caption_image image_url = params[:image_url] image_url_type = params[:image_url_type] @@ -138,19 +123,12 @@ module DiscourseAi end raise Discourse::NotFound if image.blank? - final_image_url = get_caption_url(image, image_url) + + check_secure_upload_permission(image) if image.secure? + user = current_user hijack do - if Rails.env.development? - sleep 2 # Simulate a delay of 2 seconds - caption = random_caption - else - caption = - DiscourseAi::AiHelper::Assistant.new.generate_image_caption( - final_image_url, - current_user, - ) - end + caption = DiscourseAi::AiHelper::Assistant.new.generate_image_caption(image, user) render json: { caption: "#{caption} (#{I18n.t("discourse_ai.ai_helper.image_caption.attribution")})", @@ -181,15 +159,6 @@ module DiscourseAi raise Discourse::InvalidAccess end end - - def get_caption_url(image_upload, image_url) - if image_upload.secure? - check_secure_upload_permission(image_upload) - return Discourse.store.url_for(image_upload) - end - - UrlHelper.absolute(image_url) - end end end end diff --git a/config/settings.yml b/config/settings.yml index a12ab540..fe880e2c 100644 --- a/config/settings.yml +++ b/config/settings.yml @@ -246,9 +246,7 @@ discourse_ai: ai_helper_image_caption_model: default: "llava" type: enum - choices: - - "llava" - - "open_ai:gpt-4-vision-preview" + enum: "DiscourseAi::Configuration::LlmVisionEnumerator" ai_auto_image_caption_allowed_groups: client: true type: group_list diff --git a/lib/ai_helper/assistant.rb b/lib/ai_helper/assistant.rb index 25c177a9..32cbc248 100644 --- a/lib/ai_helper/assistant.rb +++ b/lib/ai_helper/assistant.rb @@ -128,8 +128,10 @@ module DiscourseAi end end - def generate_image_caption(image_url, user) + def generate_image_caption(upload, user) if SiteSetting.ai_helper_image_caption_model == "llava" + image_url = + upload.secure? ? Discourse.store.url_for(upload) : UrlHelper.absolute(upload.url) parameters = { input: { image: image_url, @@ -144,17 +146,13 @@ module DiscourseAi else prompt = DiscourseAi::Completions::Prompt.new( + "You are a bot specializing in image captioning.", messages: [ { type: :user, - content: [ - { - type: "text", - text: - "Describe this image in a single sentence#{custom_locale_instructions(user)}", - }, - { type: "image_url", image_url: image_url }, - ], + content: + "Describe this image in a single sentence#{custom_locale_instructions(user)}", + upload_ids: [upload.id], }, ], skip_validations: true, @@ -162,7 +160,7 @@ module DiscourseAi DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_helper_image_caption_model).generate( prompt, - user: Discourse.system_user, + user: user, max_tokens: 1024, feature_name: "image_caption", ) diff --git a/lib/completions/llm.rb b/lib/completions/llm.rb index 2c71adfe..70331fb7 100644 --- a/lib/completions/llm.rb +++ b/lib/completions/llm.rb @@ -28,6 +28,15 @@ module DiscourseAi DiscourseAi::Tokenizer::BasicTokenizer.available_llm_tokenizers.map(&:name) end + def vision_models_by_provider + @vision_models_by_provider ||= { + aws_bedrock: %w[claude-3-sonnet claude-3-opus claude-3-haiku], + anthropic: %w[claude-3-sonnet claude-3-opus claude-3-haiku], + open_ai: %w[gpt-4-vision-preview gpt-4-turbo gpt-4o], + google: %w[gemini-1.5-pro gemini-1.5-flash], + } + end + def models_by_provider # ChatGPT models are listed under open_ai but they are actually available through OpenAI and Azure. # However, since they use the same URL/key settings, there's no reason to duplicate them. diff --git a/lib/configuration/llm_vision_enumerator.rb b/lib/configuration/llm_vision_enumerator.rb new file mode 100644 index 00000000..f57f4587 --- /dev/null +++ b/lib/configuration/llm_vision_enumerator.rb @@ -0,0 +1,34 @@ +# frozen_string_literal: true + +require "enum_site_setting" + +module DiscourseAi + module Configuration + class LlmVisionEnumerator < ::EnumSiteSetting + def self.valid_value?(val) + true + end + + def self.values + begin + result = + DiscourseAi::Completions::Llm.vision_models_by_provider.flat_map do |provider, models| + endpoint = DiscourseAi::Completions::Endpoints::Base.endpoint_for(provider.to_s) + + models.map do |model_name| + { name: endpoint.display_name(model_name), value: "#{provider}:#{model_name}" } + end + end + + result << { name: "Llava", value: "llava" } + + result + # TODO add support for LlmModel as well + # LlmModel.all.each do |model| + # llm_models << { name: model.display_name, value: "custom:#{model.id}" } + # end + end + end + end + end +end diff --git a/spec/requests/ai_helper/assistant_controller_spec.rb b/spec/requests/ai_helper/assistant_controller_spec.rb index 0a6cbf68..de8c2ca0 100644 --- a/spec/requests/ai_helper/assistant_controller_spec.rb +++ b/spec/requests/ai_helper/assistant_controller_spec.rb @@ -87,11 +87,6 @@ RSpec.describe DiscourseAi::AiHelper::AssistantController do expected_diff = "
Un usuario escribio estoA user wrote this