diff --git a/assets/javascripts/discourse/components/ai-llms-list-editor.gjs b/assets/javascripts/discourse/components/ai-llms-list-editor.gjs
index 5413a3a3..1afe5d8f 100644
--- a/assets/javascripts/discourse/components/ai-llms-list-editor.gjs
+++ b/assets/javascripts/discourse/components/ai-llms-list-editor.gjs
@@ -32,7 +32,7 @@ export default class AiLlmsListEditor extends Component {
       key = `${llm.provider}-${llm.name}`;
     } else {
       // case of preset
-      key = llm.id.replace(/\./g, "-");
+      key = llm.id.replace(/[.:\/]/g, "-");
     }
 
     key = `discourse_ai.llms.model_description.${key}`;
diff --git a/config/locales/client.en.yml b/config/locales/client.en.yml
index 1347816a..2da85c87 100644
--- a/config/locales/client.en.yml
+++ b/config/locales/client.en.yml
@@ -539,14 +539,16 @@ en:
         model_description:
           none: "General settings that work for most language models"
-          anthropic-claude-3-7-sonnet: "Anthropic's most intelligent model"
-          anthropic-claude-3-5-haiku: "Fast and cost-effective"
-          anthropic-claude-3-opus: "Excels at writing and complex tasks"
-          google-gemini-2-5-pro: "Mid-sized multimodal model capable of a wide range of tasks"
-          google-gemini-2-0-flash: "Lightweight, fast, and cost-efficient with multimodal reasoning"
+          anthropic-claude-opus-4-0: "Anthropic's most intelligent model"
+          anthropic-claude-sonnet-4-0: "Optimal balance of speed and cost"
+          anthropic-claude-3-7-sonnet-latest: "Optimal balance of speed and cost (previous generation)"
+          anthropic-claude-3-5-haiku-latest: "Fast and cost-effective"
+          google-gemini-2-5-pro: "Large multimodal model capable of a wide range of tasks"
+          google-gemini-2-0-flash: "Lightweight, fast, and cost-efficient with multimodal reasoning (previous generation)"
+          google-gemini-2-5-flash: "Lightweight, fast, and cost-efficient with multimodal reasoning"
           google-gemini-2-0-flash-lite: "Cost efficient and low latency model"
-          open_ai-o1: "Open AI's most capable reasoning model"
-          open_ai-o3-mini: "Advanced Cost-efficient reasoning model"
+          open_ai-o3: "Open AI's most capable reasoning model"
+          open_ai-o4-mini: "Advanced, cost-efficient reasoning model"
           open_ai-gpt-4-1: "Open AI's flagship model. It is well suited for problem solving across domains"
           open_ai-gpt-4-1-mini: "Provides a balance between intelligence, speed, and cost that makes it an attractive model for many use cases."
           open_ai-gpt-4-1-nano: "Fastest, most cost-effective GPT-4.1 model."
@@ -554,6 +556,9 @@ en:
           samba_nova-Meta-Llama-3-3-70B-Instruct": "Powerful multipurpose model"
           mistral-mistral-large-latest: "Mistral's most powerful model"
           mistral-pixtral-large-latest: "Mistral's most powerful vision capable model"
+          open_router-x-ai-grok-3-beta: "xAI's latest model"
+          open_router-deepseek-deepseek-r1-0528-free: "DeepSeek's latest reasoning model"
+          open_router-meta-llama-3-3-70b-instruct: "Highly capable multilingual model"
 
         preseeded_model_description: "Pre-configured open-source model utilizing %{model}"
diff --git a/lib/completions/endpoints/anthropic.rb b/lib/completions/endpoints/anthropic.rb
index 4155db2a..0ca940c5 100644
--- a/lib/completions/endpoints/anthropic.rb
+++ b/lib/completions/endpoints/anthropic.rb
@@ -31,6 +31,12 @@ module DiscourseAi
           "claude-3-opus-20240229"
         when "claude-3-5-sonnet"
           "claude-3-5-sonnet-latest"
+        when "claude-3-7-sonnet"
+          "claude-3-7-sonnet-latest"
+        when "claude-4-opus"
+          "claude-opus-4-20250514"
+        when "claude-4-sonnet"
+          "claude-sonnet-4-20250514"
         else
           llm_model.name
         end
@@ -92,7 +98,6 @@ module DiscourseAi
          default_options(dialect).merge(model_params.except(:response_format)).merge(
            messages: prompt.messages,
          )
-      payload[:system] = prompt.system_prompt if prompt.system_prompt.present?
 
       payload[:stream] = true if @streaming_mode
diff --git a/lib/completions/endpoints/aws_bedrock.rb b/lib/completions/endpoints/aws_bedrock.rb
index f4336ac1..f1344f3a 100644
--- a/lib/completions/endpoints/aws_bedrock.rb
+++ b/lib/completions/endpoints/aws_bedrock.rb
@@ -120,6 +120,7 @@ module DiscourseAi
          default_options(dialect).merge(model_params.except(:response_format)).merge(
            messages: prompt.messages,
          )
+      payload[:system] = prompt.system_prompt if prompt.system_prompt.present?
 
       prefilled_message = +""
diff --git a/lib/completions/endpoints/base.rb b/lib/completions/endpoints/base.rb
index bd06b540..e0f74025 100644
--- a/lib/completions/endpoints/base.rb
+++ b/lib/completions/endpoints/base.rb
@@ -48,6 +48,14 @@ module DiscourseAi
        @llm_model = llm_model
      end
 
+      def enforce_max_output_tokens(value)
+        if @llm_model.max_output_tokens.to_i > 0
+          value = @llm_model.max_output_tokens if (value.to_i > @llm_model.max_output_tokens) ||
+            (value.to_i <= 0)
+        end
+        value
+      end
+
      def use_ssl?
        if model_uri&.scheme.present?
          model_uri.scheme == "https"
@@ -83,6 +91,8 @@ module DiscourseAi
        @partial_tool_calls = partial_tool_calls
        @output_thinking = output_thinking
 
+        max_tokens = enforce_max_output_tokens(model_params[:max_tokens])
+        model_params[:max_tokens] = max_tokens if max_tokens
        model_params = normalize_model_params(model_params)
        orig_blk = blk
diff --git a/lib/completions/endpoints/gemini.rb b/lib/completions/endpoints/gemini.rb
index 1db5b76f..7f12c3f3 100644
--- a/lib/completions/endpoints/gemini.rb
+++ b/lib/completions/endpoints/gemini.rb
@@ -63,6 +63,7 @@ module DiscourseAi
        tools = dialect.tools if @native_tool_support
 
        payload = default_options.merge(contents: prompt[:messages])
+
        payload[:systemInstruction] = {
          role: "system",
          parts: [{ text: prompt[:system_instruction].to_s }],
diff --git a/lib/completions/llm.rb b/lib/completions/llm.rb
index 04d517cb..8eb85ce0 100644
--- a/lib/completions/llm.rb
+++ b/lib/completions/llm.rb
@@ -27,7 +27,7 @@ module DiscourseAi
             id: "anthropic",
             models: [
               {
-                name: "claude-3-7-sonnet",
+                name: "claude-3-7-sonnet-latest",
                 tokens: 200_000,
                 display_name: "Claude 3.7 Sonnet",
                 input_cost: 3,
@@ -35,7 +35,15 @@ module DiscourseAi
                 output_cost: 15,
               },
               {
-                name: "claude-3-5-haiku",
+                name: "claude-sonnet-4-0",
+                tokens: 200_000,
+                display_name: "Claude 4 Sonnet",
+                input_cost: 3,
+                cached_input_cost: 0.30,
+                output_cost: 15,
+              },
+              {
+                name: "claude-3-5-haiku-latest",
                 tokens: 200_000,
                 display_name: "Claude 3.5 Haiku",
                 input_cost: 0.80,
@@ -43,9 +51,9 @@ module DiscourseAi
                 output_cost: 4,
               },
               {
-                name: "claude-3-opus",
+                name: "claude-opus-4-0",
                 tokens: 200_000,
-                display_name: "Claude 3 Opus",
+                display_name: "Claude 4 Opus",
                 input_cost: 15,
                 cached_input_cost: 1.50,
                 output_cost: 75,
@@ -62,8 +70,19 @@ module DiscourseAi
                 name: "gemini-2.5-pro",
                 tokens: 800_000,
                 endpoint:
-                  "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro-preview-03-25",
+                  "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro",
                 display_name: "Gemini 2.5 Pro",
+                input_cost: 1.25,
+                output_cost: 10.0,
+              },
+              {
+                name: "gemini-2.5-flash",
+                tokens: 800_000,
+                endpoint:
+                  "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash",
+                display_name: "Gemini 2.5 Flash",
+                input_cost: 0.30,
+                output_cost: 2.50,
               },
               {
                 name: "gemini-2.0-flash",
@@ -71,6 +90,8 @@ module DiscourseAi
                 endpoint:
                   "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash",
                 display_name: "Gemini 2.0 Flash",
+                input_cost: 0.10,
+                output_cost: 0.40,
               },
               {
                 name: "gemini-2.0-flash-lite",
@@ -89,20 +110,20 @@ module DiscourseAi
             id: "open_ai",
             models: [
               {
-                name: "o3-mini",
+                name: "o4-mini",
                 tokens: 200_000,
-                display_name: "o3 Mini",
+                display_name: "o4 Mini",
                 input_cost: 1.10,
-                cached_input_cost: 0.55,
+                cached_input_cost: 0.275,
                 output_cost: 4.40,
               },
               {
-                name: "o1",
+                name: "o3",
                 tokens: 200_000,
-                display_name: "o1",
-                input_cost: 15,
-                cached_input_cost: 7.50,
-                output_cost: 60,
+                display_name: "o3",
+                input_cost: 2,
+                cached_input_cost: 0.5,
+                output_cost: 8,
               },
               {
                 name: "gpt-4.1",
@@ -177,14 +198,23 @@ module DiscourseAi
             id: "open_router",
             models: [
               {
-                name: "meta-llama/llama-3.3-70b-instruct",
-                tokens: 128_000,
-                display_name: "Llama 3.3 70B",
+                name: "x-ai/grok-3-beta",
+                tokens: 131_072,
+                display_name: "xAI Grok 3 Beta",
+                input_cost: 3,
+                output_cost: 15,
               },
               {
-                name: "google/gemini-flash-1.5-exp",
-                tokens: 1_000_000,
-                display_name: "Gemini Flash 1.5 Exp",
+                name: "deepseek/deepseek-r1-0528:free",
+                tokens: 163_000,
+                display_name: "DeepSeek R1 0528 - free",
+              },
+              {
+                name: "meta-llama/llama-3.3-70b-instruct",
+                tokens: 131_072,
+                display_name: "Llama 3.3 70B Instruct",
+                input_cost: 0.05,
+                output_cost: 0.25,
               },
             ],
             tokenizer: DiscourseAi::Tokenizer::OpenAiTokenizer,
diff --git a/spec/lib/completions/endpoints/anthropic_spec.rb b/spec/lib/completions/endpoints/anthropic_spec.rb
index 24d7e0f5..8a40c213 100644
--- a/spec/lib/completions/endpoints/anthropic_spec.rb
+++ b/spec/lib/completions/endpoints/anthropic_spec.rb
@@ -665,6 +665,51 @@ RSpec.describe DiscourseAi::Completions::Endpoints::Anthropic do
     expect(log.response_tokens).to eq(30)
   end
 
+  describe "max output tokens" do
+    it "respects max output tokens supplied to model unconditionally, even with thinking" do
+      model.update!(
+        provider_params: {
+          enable_reasoning: true,
+          reasoning_tokens: 1000,
+        },
+        max_output_tokens: 2000,
+      )
+
+      parsed_body = nil
+      stub_request(:post, url).with(
+        body:
+          proc do |req_body|
+            parsed_body = JSON.parse(req_body, symbolize_names: true)
+            true
+          end,
+        headers: {
+          "Content-Type" => "application/json",
+          "X-Api-Key" => "123",
+          "Anthropic-Version" => "2023-06-01",
+        },
+      ).to_return(
+        status: 200,
+        body: {
+          id: "msg_123",
+          type: "message",
+          role: "assistant",
+          content: [{ type: "text", text: "test response" }],
+          model: "claude-3-opus-20240229",
+          usage: {
+            input_tokens: 10,
+            output_tokens: 5,
+          },
+        }.to_json,
+      )
+
+      llm.generate(prompt, user: Discourse.system_user, max_tokens: 2500)
+      expect(parsed_body[:max_tokens]).to eq(2000)
+
+      llm.generate(prompt, user: Discourse.system_user)
+      expect(parsed_body[:max_tokens]).to eq(2000)
+    end
+  end
+
   describe "parameter disabling" do
     it "excludes disabled parameters from the request" do
       model.update!(provider_params: { disable_top_p: true, disable_temperature: true })
diff --git a/spec/lib/completions/endpoints/gemini_spec.rb b/spec/lib/completions/endpoints/gemini_spec.rb
index 1421e0f7..4217f0e9 100644
--- a/spec/lib/completions/endpoints/gemini_spec.rb
+++ b/spec/lib/completions/endpoints/gemini_spec.rb
@@ -179,6 +179,40 @@ RSpec.describe DiscourseAi::Completions::Endpoints::Gemini do
     expect(parsed.dig(:generationConfig, :thinkingConfig)).to eq({ thinkingBudget: 10_000 })
   end
 
+  it "correctly handles max output tokens" do
+    model.update!(max_output_tokens: 1000)
+
+    response = gemini_mock.response("some response mode").to_json
+
+    req_body = nil
+
+    llm = DiscourseAi::Completions::Llm.proxy("custom:#{model.id}")
+    url = "#{model.url}:generateContent?key=123"
+
+    stub_request(:post, url).with(
+      body:
+        proc do |_req_body|
+          req_body = _req_body
+          true
+        end,
+    ).to_return(status: 200, body: response)
+
+    response = llm.generate("Hello", user: user, max_tokens: 10_000)
+    parsed = JSON.parse(req_body, symbolize_names: true)
+
+    expect(parsed.dig(:generationConfig, :maxOutputTokens)).to eq(1000)
+
+    response = llm.generate("Hello", user: user, max_tokens: 50)
+    parsed = JSON.parse(req_body, symbolize_names: true)
+
+    expect(parsed.dig(:generationConfig, :maxOutputTokens)).to eq(50)
+
+    response = llm.generate("Hello", user: user)
+    parsed = JSON.parse(req_body, symbolize_names: true)
+
+    expect(parsed.dig(:generationConfig, :maxOutputTokens)).to eq(1000)
+  end
+
   it "clamps thinking tokens within allowed limits" do
     model.update!(provider_params: { enable_thinking: "true", thinking_tokens: "30000" })
@@ -551,7 +585,7 @@ RSpec.describe DiscourseAi::Completions::Endpoints::Gemini do
{"promptTokenCount": 399,"candidatesTokenCount": 191,"totalTokenCount": 590},"modelVersion": "gemini-1.5-pro-002"} data: {"candidates": [{"content": {"parts": [{"text": "\\""}],"role": "model"}}],"usageMetadata": {"promptTokenCount": 399,"totalTokenCount": 399},"modelVersion": "gemini-1.5-pro-002"} - + data: {"candidates": [{"content": {"parts": [{"text": "num"}],"role": "model"},"finishReason": "STOP"}],"usageMetadata": {"promptTokenCount": 399,"candidatesTokenCount": 191,"totalTokenCount": 590},"modelVersion": "gemini-1.5-pro-002"} data: {"candidates": [{"content": {"parts": [{"text": "\\":"}],"role": "model"},"safetyRatings": [{"category": "HARM_CATEGORY_HATE_SPEECH","probability": "NEGLIGIBLE"},{"category": "HARM_CATEGORY_DANGEROUS_CONTENT","probability": "NEGLIGIBLE"},{"category": "HARM_CATEGORY_HARASSMENT","probability": "NEGLIGIBLE"},{"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT","probability": "NEGLIGIBLE"}]}],"usageMetadata": {"promptTokenCount": 399,"totalTokenCount": 399},"modelVersion": "gemini-1.5-pro-002"} diff --git a/spec/lib/completions/endpoints/open_ai_spec.rb b/spec/lib/completions/endpoints/open_ai_spec.rb index 524ff50d..1480324a 100644 --- a/spec/lib/completions/endpoints/open_ai_spec.rb +++ b/spec/lib/completions/endpoints/open_ai_spec.rb @@ -173,7 +173,7 @@ RSpec.describe DiscourseAi::Completions::Endpoints::OpenAi do describe "max tokens for reasoning models" do it "uses max_completion_tokens for reasoning models" do - model.update!(name: "o3-mini") + model.update!(name: "o3-mini", max_output_tokens: 999) llm = DiscourseAi::Completions::Llm.proxy("custom:#{model.id}") prompt = DiscourseAi::Completions::Prompt.new( @@ -201,7 +201,13 @@ RSpec.describe DiscourseAi::Completions::Endpoints::OpenAi do llm.generate(prompt, user: user, max_tokens: 1000) { |chunk| result << chunk } expect(result).to eq("hello") - expect(body_parsed["max_completion_tokens"]).to eq(1000) + expect(body_parsed["max_completion_tokens"]).to eq(999) + + llm.generate(prompt, user: user, max_tokens: 100) { |chunk| result << chunk } + expect(body_parsed["max_completion_tokens"]).to eq(100) + + llm.generate(prompt, user: user) { |chunk| result << chunk } + expect(body_parsed["max_completion_tokens"]).to eq(999) end end diff --git a/spec/system/llms/ai_llm_spec.rb b/spec/system/llms/ai_llm_spec.rb index e16d567f..2fdb0339 100644 --- a/spec/system/llms/ai_llm_spec.rb +++ b/spec/system/llms/ai_llm_spec.rb @@ -14,7 +14,7 @@ RSpec.describe "Managing LLM configurations", type: :system, js: true do it "correctly sets defaults" do visit "/admin/plugins/discourse-ai/ai-llms" - find("[data-llm-id='anthropic-claude-3-5-haiku'] button").click() + find("[data-llm-id='anthropic-claude-opus-4-0'] button").click() form.field("api_key").fill_in("abcd") form.field("enabled_chat_bot").toggle form.submit @@ -26,9 +26,9 @@ RSpec.describe "Managing LLM configurations", type: :system, js: true do expect(llm.api_key).to eq("abcd") preset = DiscourseAi::Completions::Llm.presets.find { |p| p[:id] == "anthropic" } - model_preset = preset[:models].find { |m| m[:name] == "claude-3-5-haiku" } + model_preset = preset[:models].find { |m| m[:name] == "claude-opus-4-0" } - expect(llm.name).to eq("claude-3-5-haiku") + expect(llm.name).to eq("claude-opus-4-0") expect(llm.url).to eq(preset[:endpoint]) expect(llm.tokenizer).to eq(preset[:tokenizer].to_s) expect(llm.max_prompt_tokens.to_i).to eq(model_preset[:tokens])