From 2a62658248f7a4d7c496590691fd51ec1c307701 Mon Sep 17 00:00:00 2001
From: Sam
Date: Thu, 8 May 2025 07:39:50 +1000
Subject: [PATCH] FEATURE: support configurable thinking tokens for Gemini
 (#1322)

---
 app/models/llm_model.rb                       |  2 +
 config/locales/client.en.yml                  |  2 +
 lib/completions/endpoints/gemini.rb           |  6 ++
 spec/lib/completions/endpoints/gemini_spec.rb | 78 +++++++++++++++++++
 4 files changed, 88 insertions(+)

diff --git a/app/models/llm_model.rb b/app/models/llm_model.rb
index 05c7be4b..5efbbb0f 100644
--- a/app/models/llm_model.rb
+++ b/app/models/llm_model.rb
@@ -63,6 +63,8 @@ class LlmModel < ActiveRecord::Base
       },
       google: {
         disable_native_tools: :checkbox,
+        enable_thinking: :checkbox,
+        thinking_tokens: :number,
       },
       azure: {
         disable_native_tools: :checkbox,
diff --git a/config/locales/client.en.yml b/config/locales/client.en.yml
index 0c2c98d2..04901dc2 100644
--- a/config/locales/client.en.yml
+++ b/config/locales/client.en.yml
@@ -533,6 +533,8 @@ en:
           disable_streaming: "Disable streaming completions (convert streaming to non streaming requests)"
           reasoning_effort: "Reasoning effort (only applicable to reasoning models)"
           enable_reasoning: "Enable reasoning (only applicable to Sonnet 3.7)"
+          enable_thinking: "Enable thinking (only applicable to thinking models, e.g. Gemini Flash 2.5)"
+          thinking_tokens: "Number of tokens used for thinking"
           reasoning_tokens: "Number of tokens used for reasoning"
           disable_temperature: "Disable temperature (some thinking models don't support temperature)"
           disable_top_p: "Disable top P (some thinking models don't support top P)"
diff --git a/lib/completions/endpoints/gemini.rb b/lib/completions/endpoints/gemini.rb
index e52fd46f..025d4fbc 100644
--- a/lib/completions/endpoints/gemini.rb
+++ b/lib/completions/endpoints/gemini.rb
@@ -94,6 +94,12 @@ module DiscourseAi
           end
         end
 
+        if llm_model.lookup_custom_param("enable_thinking")
+          thinking_tokens = llm_model.lookup_custom_param("thinking_tokens").to_i
+          thinking_tokens = thinking_tokens.clamp(0, 24_576)
+          payload[:generationConfig][:thinkingConfig] = { thinkingBudget: thinking_tokens }
+        end
+
         payload
       end
 
diff --git a/spec/lib/completions/endpoints/gemini_spec.rb b/spec/lib/completions/endpoints/gemini_spec.rb
index a4355d92..1cf8ca1d 100644
--- a/spec/lib/completions/endpoints/gemini_spec.rb
+++ b/spec/lib/completions/endpoints/gemini_spec.rb
@@ -153,6 +153,84 @@ RSpec.describe DiscourseAi::Completions::Endpoints::Gemini do
     }
   end
 
+  it "correctly configures thinking when enabled" do
+    model.update!(provider_params: { enable_thinking: "true", thinking_tokens: "10000" })
+
+    response = gemini_mock.response("Using thinking mode").to_json
+
+    req_body = nil
+
+    llm = DiscourseAi::Completions::Llm.proxy("custom:#{model.id}")
+    url = "#{model.url}:generateContent?key=123"
+
+    stub_request(:post, url).with(
+      body:
+        proc do |_req_body|
+          req_body = _req_body
+          true
+        end,
+    ).to_return(status: 200, body: response)
+
+    response = llm.generate("Hello", user: user)
+
+    parsed = JSON.parse(req_body, symbolize_names: true)
+
+    # Verify thinking config is properly set with the token limit
+    expect(parsed.dig(:generationConfig, :thinkingConfig)).to eq({ thinkingBudget: 10_000 })
+  end
+
+  it "clamps thinking tokens within allowed limits" do
+    model.update!(provider_params: { enable_thinking: "true", thinking_tokens: "30000" })
+
+    response = gemini_mock.response("Thinking tokens clamped").to_json
+
+    req_body = nil
+
+    llm = DiscourseAi::Completions::Llm.proxy("custom:#{model.id}")
+    url = "#{model.url}:generateContent?key=123"
+
+    stub_request(:post, url).with(
+      body:
+        proc do |_req_body|
+          req_body = _req_body
+          true
+        end,
+    ).to_return(status: 200, body: response)
+
+    response = llm.generate("Hello", user: user)
+
+    parsed = JSON.parse(req_body, symbolize_names: true)
+
+    # Verify thinking tokens are clamped to 24_576
+    expect(parsed.dig(:generationConfig, :thinkingConfig)).to eq({ thinkingBudget: 24_576 })
+  end
+
+  it "does not add thinking config when disabled" do
+    model.update!(provider_params: { enable_thinking: false, thinking_tokens: "10000" })
+
+    response = gemini_mock.response("No thinking mode").to_json
+
+    req_body = nil
+
+    llm = DiscourseAi::Completions::Llm.proxy("custom:#{model.id}")
+    url = "#{model.url}:generateContent?key=123"
+
+    stub_request(:post, url).with(
+      body:
+        proc do |_req_body|
+          req_body = _req_body
+          true
+        end,
+    ).to_return(status: 200, body: response)
+
+    response = llm.generate("Hello", user: user)
+
+    parsed = JSON.parse(req_body, symbolize_names: true)
+
+    # Verify thinking config is not present
+    expect(parsed.dig(:generationConfig, :thinkingConfig)).to be_nil
+  end
+
   # by default gemini is meant to use AUTO mode, however new experimental models
   # appear to require this to be explicitly set
   it "Explicitly specifies tool config" do
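
For illustration, a minimal standalone Ruby sketch of the mapping the gemini.rb hunk implements. The thinking_config helper name and the plain-hash params are hypothetical, for this sketch only; in the plugin the values come from llm_model.lookup_custom_param inside the payload-building code above:

    # Hypothetical helper mirroring the patched behavior: an opt-in flag plus a
    # token budget, clamped to the 0..24_576 range used in the patch.
    def thinking_config(params)
      return nil unless params["enable_thinking"]

      budget = params["thinking_tokens"].to_i.clamp(0, 24_576)
      { thinkingBudget: budget }
    end

    thinking_config("enable_thinking" => true, "thinking_tokens" => "10000")
    # => { thinkingBudget: 10000 }
    thinking_config("enable_thinking" => true, "thinking_tokens" => "30000")
    # => { thinkingBudget: 24576 } (over-budget values are clamped, per the second spec)
    thinking_config("enable_thinking" => false)
    # => nil (no thinkingConfig is sent, per the third spec)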