FEATURE: add support for all vision models (#646)
Previously only GPT-4-vision was supported; this change introduces support for Google/Anthropic and new OpenAI models. Additionally, this makes vision work properly in dev environments, because we send the encoded payload via the prompt instead of sending URLs.
This commit is contained in:
parent
dd4e305ff7
commit
b487de933d
|
@ -107,21 +107,6 @@ module DiscourseAi
|
||||||
status: 502
|
status: 502
|
||||||
end
|
end
|
||||||
|
|
||||||
def random_caption
|
|
||||||
captions = [
|
|
||||||
"A beautiful landscape",
|
|
||||||
"An adorable puppy",
|
|
||||||
"A delicious meal",
|
|
||||||
"A cozy fireplace",
|
|
||||||
"A stunning sunset",
|
|
||||||
"A charming cityscape",
|
|
||||||
"A peaceful garden",
|
|
||||||
"A majestic mountain range",
|
|
||||||
"A captivating work of art",
|
|
||||||
]
|
|
||||||
captions.sample
|
|
||||||
end
|
|
||||||
|
|
||||||
def caption_image
|
def caption_image
|
||||||
image_url = params[:image_url]
|
image_url = params[:image_url]
|
||||||
image_url_type = params[:image_url_type]
|
image_url_type = params[:image_url_type]
|
||||||
|
@ -138,19 +123,12 @@ module DiscourseAi
|
||||||
end
|
end
|
||||||
|
|
||||||
raise Discourse::NotFound if image.blank?
|
raise Discourse::NotFound if image.blank?
|
||||||
final_image_url = get_caption_url(image, image_url)
|
|
||||||
|
check_secure_upload_permission(image) if image.secure?
|
||||||
|
user = current_user
|
||||||
|
|
||||||
hijack do
|
hijack do
|
||||||
if Rails.env.development?
|
caption = DiscourseAi::AiHelper::Assistant.new.generate_image_caption(image, user)
|
||||||
sleep 2 # Simulate a delay of 2 seconds
|
|
||||||
caption = random_caption
|
|
||||||
else
|
|
||||||
caption =
|
|
||||||
DiscourseAi::AiHelper::Assistant.new.generate_image_caption(
|
|
||||||
final_image_url,
|
|
||||||
current_user,
|
|
||||||
)
|
|
||||||
end
|
|
||||||
render json: {
|
render json: {
|
||||||
caption:
|
caption:
|
||||||
"#{caption} (#{I18n.t("discourse_ai.ai_helper.image_caption.attribution")})",
|
"#{caption} (#{I18n.t("discourse_ai.ai_helper.image_caption.attribution")})",
|
||||||
|
@ -181,15 +159,6 @@ module DiscourseAi
|
||||||
raise Discourse::InvalidAccess
|
raise Discourse::InvalidAccess
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def get_caption_url(image_upload, image_url)
|
|
||||||
if image_upload.secure?
|
|
||||||
check_secure_upload_permission(image_upload)
|
|
||||||
return Discourse.store.url_for(image_upload)
|
|
||||||
end
|
|
||||||
|
|
||||||
UrlHelper.absolute(image_url)
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -246,9 +246,7 @@ discourse_ai:
|
||||||
ai_helper_image_caption_model:
|
ai_helper_image_caption_model:
|
||||||
default: "llava"
|
default: "llava"
|
||||||
type: enum
|
type: enum
|
||||||
choices:
|
enum: "DiscourseAi::Configuration::LlmVisionEnumerator"
|
||||||
- "llava"
|
|
||||||
- "open_ai:gpt-4-vision-preview"
|
|
||||||
ai_auto_image_caption_allowed_groups:
|
ai_auto_image_caption_allowed_groups:
|
||||||
client: true
|
client: true
|
||||||
type: group_list
|
type: group_list
|
||||||
|
|
|
@ -128,8 +128,10 @@ module DiscourseAi
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def generate_image_caption(image_url, user)
|
def generate_image_caption(upload, user)
|
||||||
if SiteSetting.ai_helper_image_caption_model == "llava"
|
if SiteSetting.ai_helper_image_caption_model == "llava"
|
||||||
|
image_url =
|
||||||
|
upload.secure? ? Discourse.store.url_for(upload) : UrlHelper.absolute(upload.url)
|
||||||
parameters = {
|
parameters = {
|
||||||
input: {
|
input: {
|
||||||
image: image_url,
|
image: image_url,
|
||||||
|
@ -144,17 +146,13 @@ module DiscourseAi
|
||||||
else
|
else
|
||||||
prompt =
|
prompt =
|
||||||
DiscourseAi::Completions::Prompt.new(
|
DiscourseAi::Completions::Prompt.new(
|
||||||
|
"You are a bot specializing in image captioning.",
|
||||||
messages: [
|
messages: [
|
||||||
{
|
{
|
||||||
type: :user,
|
type: :user,
|
||||||
content: [
|
content:
|
||||||
{
|
"Describe this image in a single sentence#{custom_locale_instructions(user)}",
|
||||||
type: "text",
|
upload_ids: [upload.id],
|
||||||
text:
|
|
||||||
"Describe this image in a single sentence#{custom_locale_instructions(user)}",
|
|
||||||
},
|
|
||||||
{ type: "image_url", image_url: image_url },
|
|
||||||
],
|
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
skip_validations: true,
|
skip_validations: true,
|
||||||
|
@ -162,7 +160,7 @@ module DiscourseAi
|
||||||
|
|
||||||
DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_helper_image_caption_model).generate(
|
DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_helper_image_caption_model).generate(
|
||||||
prompt,
|
prompt,
|
||||||
user: Discourse.system_user,
|
user: user,
|
||||||
max_tokens: 1024,
|
max_tokens: 1024,
|
||||||
feature_name: "image_caption",
|
feature_name: "image_caption",
|
||||||
)
|
)
|
||||||
|
|
|
@ -28,6 +28,15 @@ module DiscourseAi
|
||||||
DiscourseAi::Tokenizer::BasicTokenizer.available_llm_tokenizers.map(&:name)
|
DiscourseAi::Tokenizer::BasicTokenizer.available_llm_tokenizers.map(&:name)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def vision_models_by_provider
|
||||||
|
@vision_models_by_provider ||= {
|
||||||
|
aws_bedrock: %w[claude-3-sonnet claude-3-opus claude-3-haiku],
|
||||||
|
anthropic: %w[claude-3-sonnet claude-3-opus claude-3-haiku],
|
||||||
|
open_ai: %w[gpt-4-vision-preview gpt-4-turbo gpt-4o],
|
||||||
|
google: %w[gemini-1.5-pro gemini-1.5-flash],
|
||||||
|
}
|
||||||
|
end
|
||||||
|
|
||||||
def models_by_provider
|
def models_by_provider
|
||||||
# ChatGPT models are listed under open_ai but they are actually available through OpenAI and Azure.
|
# ChatGPT models are listed under open_ai but they are actually available through OpenAI and Azure.
|
||||||
# However, since they use the same URL/key settings, there's no reason to duplicate them.
|
# However, since they use the same URL/key settings, there's no reason to duplicate them.
|
||||||
|
|
|
@ -0,0 +1,34 @@
|
||||||
|
# frozen_string_literal: true
|
||||||
|
|
||||||
|
require "enum_site_setting"
|
||||||
|
|
||||||
|
module DiscourseAi
|
||||||
|
module Configuration
|
||||||
|
class LlmVisionEnumerator < ::EnumSiteSetting
|
||||||
|
def self.valid_value?(val)
|
||||||
|
true
|
||||||
|
end
|
||||||
|
|
||||||
|
def self.values
|
||||||
|
begin
|
||||||
|
result =
|
||||||
|
DiscourseAi::Completions::Llm.vision_models_by_provider.flat_map do |provider, models|
|
||||||
|
endpoint = DiscourseAi::Completions::Endpoints::Base.endpoint_for(provider.to_s)
|
||||||
|
|
||||||
|
models.map do |model_name|
|
||||||
|
{ name: endpoint.display_name(model_name), value: "#{provider}:#{model_name}" }
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
result << { name: "Llava", value: "llava" }
|
||||||
|
|
||||||
|
result
|
||||||
|
# TODO add support for LlmModel as well
|
||||||
|
# LlmModel.all.each do |model|
|
||||||
|
# llm_models << { name: model.display_name, value: "custom:#{model.id}" }
|
||||||
|
# end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
|
@ -87,11 +87,6 @@ RSpec.describe DiscourseAi::AiHelper::AssistantController do
|
||||||
expected_diff =
|
expected_diff =
|
||||||
"<div class=\"inline-diff\"><p><ins>Un </ins><ins>usuario </ins><ins>escribio </ins><ins>esto</ins><del>A </del><del>user </del><del>wrote </del><del>this</del></p></div>"
|
"<div class=\"inline-diff\"><p><ins>Un </ins><ins>usuario </ins><ins>escribio </ins><ins>esto</ins><del>A </del><del>user </del><del>wrote </del><del>this</del></p></div>"
|
||||||
|
|
||||||
expected_input = <<~TEXT.strip
|
|
||||||
<input>Translate to Spanish:
|
|
||||||
A user wrote this</input>
|
|
||||||
TEXT
|
|
||||||
|
|
||||||
DiscourseAi::Completions::Llm.with_prepared_responses([translated_text]) do
|
DiscourseAi::Completions::Llm.with_prepared_responses([translated_text]) do
|
||||||
post "/discourse-ai/ai-helper/suggest",
|
post "/discourse-ai/ai-helper/suggest",
|
||||||
params: {
|
params: {
|
||||||
|
|
Loading…
Reference in New Issue