FEATURE: add support for all vision models (#646)

Previously only GPT-4-vision was supported; this change introduces support
for Google/Anthropic models as well as the new OpenAI models.

Additionally, this makes vision work properly in dev environments,
because we now send the encoded image payload via the prompt instead of sending URLs.
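As a rough sketch of the new flow (mirroring the assistant changes in the diff below; the upload and user variables are placeholders, and the payload encoding itself happens inside the completions layer), the caption prompt now references the upload by id so the encoded image travels with the prompt:

    # Sketch based on the new generate_image_caption below. Attaching the
    # image via upload_ids lets the completions layer inline the encoded
    # payload instead of handing the model a URL it may not be able to reach.
    prompt =
      DiscourseAi::Completions::Prompt.new(
        "You are a bot specializing in image captioning.",
        messages: [
          { type: :user, content: "Describe this image in a single sentence", upload_ids: [upload.id] },
        ],
        skip_validations: true,
      )
    caption =
      DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_helper_image_caption_model).generate(
        prompt,
        user: user,
        max_tokens: 1024,
        feature_name: "image_caption",
      )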
Sam, 2024-05-28 23:31:15 +10:00 (committed by GitHub)
parent dd4e305ff7
commit b487de933d
6 changed files with 56 additions and 53 deletions


@@ -107,21 +107,6 @@ module DiscourseAi
                             status: 502
       end
 
-      def random_caption
-        captions = [
-          "A beautiful landscape",
-          "An adorable puppy",
-          "A delicious meal",
-          "A cozy fireplace",
-          "A stunning sunset",
-          "A charming cityscape",
-          "A peaceful garden",
-          "A majestic mountain range",
-          "A captivating work of art",
-        ]
-        captions.sample
-      end
-
       def caption_image
         image_url = params[:image_url]
         image_url_type = params[:image_url_type]
@@ -138,19 +123,12 @@ module DiscourseAi
         end
 
         raise Discourse::NotFound if image.blank?
-        final_image_url = get_caption_url(image, image_url)
+        check_secure_upload_permission(image) if image.secure?
+        user = current_user
 
         hijack do
-          if Rails.env.development?
-            sleep 2 # Simulate a delay of 2 seconds
-            caption = random_caption
-          else
-            caption =
-              DiscourseAi::AiHelper::Assistant.new.generate_image_caption(
-                final_image_url,
-                current_user,
-              )
-          end
+          caption = DiscourseAi::AiHelper::Assistant.new.generate_image_caption(image, user)
+
           render json: {
                    caption:
                      "#{caption} (#{I18n.t("discourse_ai.ai_helper.image_caption.attribution")})",
@@ -181,15 +159,6 @@ module DiscourseAi
           raise Discourse::InvalidAccess
         end
       end
-
-      def get_caption_url(image_upload, image_url)
-        if image_upload.secure?
-          check_secure_upload_permission(image_upload)
-          return Discourse.store.url_for(image_upload)
-        end
-
-        UrlHelper.absolute(image_url)
-      end
     end
   end
 end
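For reference, a minimal request sketch against the simplified action; the route path and the image_url_type value are assumptions inferred from the params read above, not verified against the plugin's routes:

    # Hypothetical request spec sketch: permission checks and captioning now
    # happen inside caption_image itself, in a single assistant call.
    post "/discourse-ai/ai-helper/caption_image",
         params: { image_url: upload.url, image_url_type: "long_url" }
    expect(response.status).to eq(200)
    expect(response.parsed_body["caption"]).to be_present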


@@ -246,9 +246,7 @@ discourse_ai:
   ai_helper_image_caption_model:
     default: "llava"
     type: enum
-    choices:
-      - "llava"
-      - "open_ai:gpt-4-vision-preview"
+    enum: "DiscourseAi::Configuration::LlmVisionEnumerator"
   ai_auto_image_caption_allowed_groups:
     client: true
     type: group_list
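With the enumerator wired in, valid values take a provider:model shape, plus the bare "llava" fallback. Assuming the enumerator output added later in this commit, switching models from a Rails console would look roughly like:

    # Values mirror LlmVisionEnumerator: "provider:model_name" or "llava".
    SiteSetting.ai_helper_image_caption_model = "open_ai:gpt-4o"
    SiteSetting.ai_helper_image_caption_model = "llava" # self-hosted fallback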


@@ -128,8 +128,10 @@ module DiscourseAi
         end
       end
 
-      def generate_image_caption(image_url, user)
+      def generate_image_caption(upload, user)
         if SiteSetting.ai_helper_image_caption_model == "llava"
+          image_url =
+            upload.secure? ? Discourse.store.url_for(upload) : UrlHelper.absolute(upload.url)
           parameters = {
             input: {
               image: image_url,
@@ -144,17 +146,13 @@ module DiscourseAi
         else
           prompt =
             DiscourseAi::Completions::Prompt.new(
               "You are a bot specializing in image captioning.",
               messages: [
                 {
                   type: :user,
-                  content: [
-                    {
-                      type: "text",
-                      text:
-                        "Describe this image in a single sentence#{custom_locale_instructions(user)}",
-                    },
-                    { type: "image_url", image_url: image_url },
-                  ],
+                  content:
+                    "Describe this image in a single sentence#{custom_locale_instructions(user)}",
+                  upload_ids: [upload.id],
                 },
               ],
               skip_validations: true,
@@ -162,7 +160,7 @@ module DiscourseAi
 
         DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_helper_image_caption_model).generate(
           prompt,
-          user: Discourse.system_user,
+          user: user,
           max_tokens: 1024,
           feature_name: "image_caption",
         )
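A quick usage sketch of the updated signature (the Upload lookup is a hypothetical placeholder; the call itself matches the controller change above). generate_image_caption now receives the Upload record, and only the llava branch still resolves a URL from it:

    upload = Upload.find(42) # placeholder lookup; any Upload record works
    caption = DiscourseAi::AiHelper::Assistant.new.generate_image_caption(upload, current_user)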


@@ -28,6 +28,15 @@ module DiscourseAi
         DiscourseAi::Tokenizer::BasicTokenizer.available_llm_tokenizers.map(&:name)
       end
 
+      def vision_models_by_provider
+        @vision_models_by_provider ||= {
+          aws_bedrock: %w[claude-3-sonnet claude-3-opus claude-3-haiku],
+          anthropic: %w[claude-3-sonnet claude-3-opus claude-3-haiku],
+          open_ai: %w[gpt-4-vision-preview gpt-4-turbo gpt-4o],
+          google: %w[gemini-1.5-pro gemini-1.5-flash],
+        }
+      end
+
       def models_by_provider
         # ChatGPT models are listed under open_ai but they are actually available through OpenAI and Azure.
         # However, since they use the same URL/key settings, there's no reason to duplicate them.
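Since vision_models_by_provider keys provider symbols to model-name arrays, callers can enumerate every vision-capable provider:model pair, much as the enumerator added below does:

    # Prints e.g. "open_ai:gpt-4o", one line per vision-capable combination.
    DiscourseAi::Completions::Llm.vision_models_by_provider.each do |provider, models|
      models.each { |model| puts "#{provider}:#{model}" }
    end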


@@ -0,0 +1,34 @@
+# frozen_string_literal: true
+
+require "enum_site_setting"
+
+module DiscourseAi
+  module Configuration
+    class LlmVisionEnumerator < ::EnumSiteSetting
+      def self.valid_value?(val)
+        true
+      end
+
+      def self.values
+        begin
+          result =
+            DiscourseAi::Completions::Llm.vision_models_by_provider.flat_map do |provider, models|
+              endpoint = DiscourseAi::Completions::Endpoints::Base.endpoint_for(provider.to_s)
+
+              models.map do |model_name|
+                { name: endpoint.display_name(model_name), value: "#{provider}:#{model_name}" }
+              end
+            end
+
+          result << { name: "Llava", value: "llava" }
+
+          result
+          # TODO add support for LlmModel as well
+          # LlmModel.all.each do |model|
+          #   llm_models << { name: model.display_name, value: "custom:#{model.id}" }
+          # end
+        end
+      end
+    end
+  end
+end
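For reference, a sketch of what the enumerator yields; display names come from each endpoint class's display_name, so the exact strings here are assumptions:

    DiscourseAi::Configuration::LlmVisionEnumerator.values
    # => [{ name: "Claude 3 Sonnet", value: "anthropic:claude-3-sonnet" },
    #     ...,
    #     { name: "Llava", value: "llava" }]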


@@ -87,11 +87,6 @@ RSpec.describe DiscourseAi::AiHelper::AssistantController do
       expected_diff =
         "<div class=\"inline-diff\"><p><ins>Un </ins><ins>usuario </ins><ins>escribio </ins><ins>esto</ins><del>A </del><del>user </del><del>wrote </del><del>this</del></p></div>"
-      expected_input = <<~TEXT.strip
-        <input>Translate to Spanish:
-        A user wrote this</input>
-      TEXT
-
       DiscourseAi::Completions::Llm.with_prepared_responses([translated_text]) do
         post "/discourse-ai/ai-helper/suggest",
              params: {