From 2a62658248f7a4d7c496590691fd51ec1c307701 Mon Sep 17 00:00:00 2001
From: Sam
Date: Thu, 8 May 2025 07:39:50 +1000
Subject: [PATCH] FEATURE: support configurable thinking tokens for Gemini
 (#1322)

---
 app/models/llm_model.rb                       |  2 +
 config/locales/client.en.yml                  |  2 +
 lib/completions/endpoints/gemini.rb           |  6 ++
 spec/lib/completions/endpoints/gemini_spec.rb | 78 +++++++++++++++++++
 4 files changed, 88 insertions(+)

diff --git a/app/models/llm_model.rb b/app/models/llm_model.rb
index 05c7be4b..5efbbb0f 100644
--- a/app/models/llm_model.rb
+++ b/app/models/llm_model.rb
@@ -63,6 +63,8 @@ class LlmModel < ActiveRecord::Base
       },
       google: {
         disable_native_tools: :checkbox,
+        enable_thinking: :checkbox,
+        thinking_tokens: :number,
       },
       azure: {
         disable_native_tools: :checkbox,
diff --git a/config/locales/client.en.yml b/config/locales/client.en.yml
index 0c2c98d2..04901dc2 100644
--- a/config/locales/client.en.yml
+++ b/config/locales/client.en.yml
@@ -533,6 +533,8 @@ en:
           disable_streaming: "Disable streaming completions (convert streaming to non streaming requests)"
           reasoning_effort: "Reasoning effort (only applicable to reasoning models)"
           enable_reasoning: "Enable reasoning (only applicable to Sonnet 3.7)"
+          enable_thinking: "Enable thinking (only applicable to thinking models, e.g. Gemini Flash 2.5)"
+          thinking_tokens: "Number of tokens used for thinking"
           reasoning_tokens: "Number of tokens used for reasoning"
           disable_temperature: "Disable temperature (some thinking models don't support temperature)"
           disable_top_p: "Disable top P (some thinking models don't support top P)"
diff --git a/lib/completions/endpoints/gemini.rb b/lib/completions/endpoints/gemini.rb
index e52fd46f..025d4fbc 100644
--- a/lib/completions/endpoints/gemini.rb
+++ b/lib/completions/endpoints/gemini.rb
@@ -94,6 +94,12 @@ module DiscourseAi
           end
         end
 
+        if llm_model.lookup_custom_param("enable_thinking")
+          thinking_tokens = llm_model.lookup_custom_param("thinking_tokens").to_i
+          thinking_tokens = thinking_tokens.clamp(0, 24_576)
+          payload[:generationConfig][:thinkingConfig] = { thinkingBudget: thinking_tokens }
+        end
+
         payload
       end
 
diff --git a/spec/lib/completions/endpoints/gemini_spec.rb b/spec/lib/completions/endpoints/gemini_spec.rb
index a4355d92..1cf8ca1d 100644
--- a/spec/lib/completions/endpoints/gemini_spec.rb
+++ b/spec/lib/completions/endpoints/gemini_spec.rb
@@ -153,6 +153,84 @@ RSpec.describe DiscourseAi::Completions::Endpoints::Gemini do
     }
   end
 
+  it "correctly configures thinking when enabled" do
+    model.update!(provider_params: { enable_thinking: "true", thinking_tokens: "10000" })
+
+    response = gemini_mock.response("Using thinking mode").to_json
+
+    req_body = nil
+
+    llm = DiscourseAi::Completions::Llm.proxy("custom:#{model.id}")
+    url = "#{model.url}:generateContent?key=123"
+
+    stub_request(:post, url).with(
+      body:
+        proc do |_req_body|
+          req_body = _req_body
+          true
+        end,
+    ).to_return(status: 200, body: response)
+
+    response = llm.generate("Hello", user: user)
+
+    parsed = JSON.parse(req_body, symbolize_names: true)
+
+    # Verify thinking config is properly set with the token limit
+    expect(parsed.dig(:generationConfig, :thinkingConfig)).to eq({ thinkingBudget: 10_000 })
+  end
+
+  it "clamps thinking tokens within allowed limits" do
+    model.update!(provider_params: { enable_thinking: "true", thinking_tokens: "30000" })
+
+    response = gemini_mock.response("Thinking tokens clamped").to_json
+
+    req_body = nil
+
+    llm = DiscourseAi::Completions::Llm.proxy("custom:#{model.id}")
+    url = "#{model.url}:generateContent?key=123"
+
+    stub_request(:post, url).with(
+      body:
+        proc do |_req_body|
+          req_body = _req_body
+          true
+        end,
+    ).to_return(status: 200, body: response)
+
+    response = llm.generate("Hello", user: user)
+
+    parsed = JSON.parse(req_body, symbolize_names: true)
+
+    # Verify thinking tokens are clamped to 24_576
+    expect(parsed.dig(:generationConfig, :thinkingConfig)).to eq({ thinkingBudget: 24_576 })
+  end
+
+  it "does not add thinking config when disabled" do
+    model.update!(provider_params: { enable_thinking: false, thinking_tokens: "10000" })
+
+    response = gemini_mock.response("No thinking mode").to_json
+
+    req_body = nil
+
+    llm = DiscourseAi::Completions::Llm.proxy("custom:#{model.id}")
+    url = "#{model.url}:generateContent?key=123"
+
+    stub_request(:post, url).with(
+      body:
+        proc do |_req_body|
+          req_body = _req_body
+          true
+        end,
+    ).to_return(status: 200, body: response)
+
+    response = llm.generate("Hello", user: user)
+
+    parsed = JSON.parse(req_body, symbolize_names: true)
+
+    # Verify thinking config is not present
+    expect(parsed.dig(:generationConfig, :thinkingConfig)).to be_nil
+  end
+
   # by default gemini is meant to use AUTO mode, however new experimental models
   # appear to require this to be explicitly set
   it "Explicitly specifies tool config" do
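
For illustration, a minimal standalone Ruby sketch of the mapping the gemini.rb hunk implements. The thinking_config helper name and the plain-hash params are hypothetical, for this sketch only; in the plugin the values come from llm_model.lookup_custom_param inside the payload-building code above:

    # Hypothetical helper mirroring the patched behavior: an opt-in flag plus a
    # token budget, clamped to the 0..24_576 range used in the patch.
    def thinking_config(params)
      return nil unless params["enable_thinking"]

      budget = params["thinking_tokens"].to_i.clamp(0, 24_576)
      { thinkingBudget: budget }
    end

    thinking_config("enable_thinking" => true, "thinking_tokens" => "10000")
    # => { thinkingBudget: 10000 }
    thinking_config("enable_thinking" => true, "thinking_tokens" => "30000")
    # => { thinkingBudget: 24576 } (over-budget values are clamped, per the second spec)
    thinking_config("enable_thinking" => false)
    # => nil (no thinkingConfig is sent, per the third spec)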