FEATURE: add support for all vision models (#646)
Previously only GPT-4-vision was supported; this change introduces support for Google and Anthropic models, as well as the newer OpenAI models. It also makes vision work properly in dev environments, because we now send the encoded payload via the prompt instead of sending URLs.
This commit is contained in:
parent dd4e305ff7, commit b487de933d
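
The dev-environment fix deserves a note: an external model provider cannot fetch upload URLs from a local instance (e.g. http://localhost:3000), so instead of sending a URL, the prompt now carries the upload itself (via upload_ids in the hunks below) and the image bytes get encoded into the request payload. A minimal sketch of the idea, assuming a locally stored upload; encode_image_payload is a hypothetical helper, the real encoding lives in the per-provider completion dialects:

require "base64"

# Sketch only: inline the image bytes instead of handing the provider a URL
# it may not be able to reach (private uploads, localhost dev instances).
def encode_image_payload(upload)
  path = Discourse.store.path_for(upload) # local store; nil for remote stores
  {
    type: "image",
    mime_type: "image/png", # assumed here; derive it from the upload in real code
    data: Base64.strict_encode64(File.binread(path)),
  }
end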
@@ -107,21 +107,6 @@ module DiscourseAi
               status: 502
       end

-      def random_caption
-        captions = [
-          "A beautiful landscape",
-          "An adorable puppy",
-          "A delicious meal",
-          "A cozy fireplace",
-          "A stunning sunset",
-          "A charming cityscape",
-          "A peaceful garden",
-          "A majestic mountain range",
-          "A captivating work of art",
-        ]
-        captions.sample
-      end
-
       def caption_image
         image_url = params[:image_url]
         image_url_type = params[:image_url_type]
@@ -138,19 +123,12 @@ module DiscourseAi
         end

         raise Discourse::NotFound if image.blank?
-        final_image_url = get_caption_url(image, image_url)

+        check_secure_upload_permission(image) if image.secure?
+        user = current_user
+
         hijack do
-          if Rails.env.development?
-            sleep 2 # Simulate a delay of 2 seconds
-            caption = random_caption
-          else
-            caption =
-              DiscourseAi::AiHelper::Assistant.new.generate_image_caption(
-                final_image_url,
-                current_user,
-              )
-          end
+          caption = DiscourseAi::AiHelper::Assistant.new.generate_image_caption(image, user)
           render json: {
                    caption:
                      "#{caption} (#{I18n.t("discourse_ai.ai_helper.image_caption.attribution")})",
@@ -181,15 +159,6 @@ module DiscourseAi
           raise Discourse::InvalidAccess
         end
       end
-
-      def get_caption_url(image_upload, image_url)
-        if image_upload.secure?
-          check_secure_upload_permission(image_upload)
-          return Discourse.store.url_for(image_upload)
-        end
-
-        UrlHelper.absolute(image_url)
-      end
     end
   end
 end
@@ -246,9 +246,7 @@ discourse_ai:
   ai_helper_image_caption_model:
     default: "llava"
     type: enum
-    choices:
-      - "llava"
-      - "open_ai:gpt-4-vision-preview"
+    enum: "DiscourseAi::Configuration::LlmVisionEnumerator"
   ai_auto_image_caption_allowed_groups:
     client: true
     type: group_list
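
Note that the values produced by DiscourseAi::Configuration::LlmVisionEnumerator (added later in this commit) take the form "provider:model", with bare "llava" as the one exception, so code reading this setting can split on the first colon to recover both halves. A minimal sketch of that parsing, not the plugin's exact code:

# Illustrative only: unpacking a "provider:model" setting value.
setting = "open_ai:gpt-4-vision-preview"
provider, model = setting.split(":", 2)
provider # => "open_ai"
model    # => "gpt-4-vision-preview"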
@@ -128,8 +128,10 @@ module DiscourseAi
         end
       end

-      def generate_image_caption(image_url, user)
+      def generate_image_caption(upload, user)
         if SiteSetting.ai_helper_image_caption_model == "llava"
+          image_url =
+            upload.secure? ? Discourse.store.url_for(upload) : UrlHelper.absolute(upload.url)
           parameters = {
             input: {
               image: image_url,
@@ -144,17 +146,13 @@ module DiscourseAi
       else
         prompt =
           DiscourseAi::Completions::Prompt.new(
             "You are a bot specializing in image captioning.",
             messages: [
               {
                 type: :user,
-                content: [
-                  {
-                    type: "text",
-                    text:
-                      "Describe this image in a single sentence#{custom_locale_instructions(user)}",
-                  },
-                  { type: "image_url", image_url: image_url },
-                ],
+                content:
+                  "Describe this image in a single sentence#{custom_locale_instructions(user)}",
+                upload_ids: [upload.id],
               },
             ],
             skip_validations: true,
@@ -162,7 +160,7 @@ module DiscourseAi

         DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_helper_image_caption_model).generate(
           prompt,
-          user: Discourse.system_user,
+          user: user,
           max_tokens: 1024,
           feature_name: "image_caption",
         )
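
The upshot of the Assistant hunks: callers now pass the Upload record plus the end user, and generation is attributed to that user rather than Discourse.system_user. A usage sketch, where image is assumed to be an Upload resolved from the request as in the controller hunk above:

# Sketch of the new call shape introduced by this commit.
caption = DiscourseAi::AiHelper::Assistant.new.generate_image_caption(image, current_user)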
@@ -28,6 +28,15 @@ module DiscourseAi
       DiscourseAi::Tokenizer::BasicTokenizer.available_llm_tokenizers.map(&:name)
     end

+    def vision_models_by_provider
+      @vision_models_by_provider ||= {
+        aws_bedrock: %w[claude-3-sonnet claude-3-opus claude-3-haiku],
+        anthropic: %w[claude-3-sonnet claude-3-opus claude-3-haiku],
+        open_ai: %w[gpt-4-vision-preview gpt-4-turbo gpt-4o],
+        google: %w[gemini-1.5-pro gemini-1.5-flash],
+      }
+    end
+
     def models_by_provider
       # ChatGPT models are listed under open_ai but they are actually available through OpenAI and Azure.
       # However, since they use the same URL/key settings, there's no reason to duplicate them.
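
Since the registry's keys are symbols while setting values are strings, a vision-capability check against it might look like the following sketch; vision_capable? is a hypothetical helper, not part of this commit:

# Sketch: checking vision support for a "provider:model" setting value.
def vision_capable?(setting_value)
  return true if setting_value == "llava"

  provider, model = setting_value.split(":", 2)
  models = DiscourseAi::Completions::Llm.vision_models_by_provider[provider&.to_sym]
  models&.include?(model) || false
end

vision_capable?("open_ai:gpt-4o")        # => true
vision_capable?("open_ai:gpt-3.5-turbo") # => false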
@@ -0,0 +1,34 @@
+# frozen_string_literal: true
+
+require "enum_site_setting"
+
+module DiscourseAi
+  module Configuration
+    class LlmVisionEnumerator < ::EnumSiteSetting
+      def self.valid_value?(val)
+        true
+      end
+
+      def self.values
+        begin
+          result =
+            DiscourseAi::Completions::Llm.vision_models_by_provider.flat_map do |provider, models|
+              endpoint = DiscourseAi::Completions::Endpoints::Base.endpoint_for(provider.to_s)
+
+              models.map do |model_name|
+                { name: endpoint.display_name(model_name), value: "#{provider}:#{model_name}" }
+              end
+            end
+
+          result << { name: "Llava", value: "llava" }
+
+          result
+          # TODO add support for LlmModel as well
+          # LlmModel.all.each do |model|
+          #   llm_models << { name: model.display_name, value: "custom:#{model.id}" }
+          # end
+        end
+      end
+    end
+  end
+end
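
For reference, EnumSiteSetting subclasses feed the admin setting UI through values; the output here is roughly the following (the display names are illustrative, the real ones come from each endpoint's display_name):

# Sketch of the enumerator output consumed by the enum site setting.
DiscourseAi::Configuration::LlmVisionEnumerator.values
# => [
#      { name: "claude-3-sonnet", value: "anthropic:claude-3-sonnet" },
#      { name: "gpt-4-vision-preview", value: "open_ai:gpt-4-vision-preview" },
#      ...
#      { name: "Llava", value: "llava" },
#    ]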
@@ -87,11 +87,6 @@ RSpec.describe DiscourseAi::AiHelper::AssistantController do
       expected_diff =
         "<div class=\"inline-diff\"><p><ins>Un </ins><ins>usuario </ins><ins>escribio </ins><ins>esto</ins><del>A </del><del>user </del><del>wrote </del><del>this</del></p></div>"

-      expected_input = <<~TEXT.strip
-        <input>Translate to Spanish:
-        A user wrote this</input>
-      TEXT
-
       DiscourseAi::Completions::Llm.with_prepared_responses([translated_text]) do
         post "/discourse-ai/ai-helper/suggest",
              params: {