FEATURE: optionally support OpenAI responses API (#1423)

OpenAI has shipped a new API for completions called the "Responses API".

Certain models (e.g. o3-pro) require this API.
Additionally, certain features are only available via the new API.

This allows enabling it per LLM.

see: https://platform.openai.com/docs/api-reference/responses
Sam 2025-06-11 17:12:25 +10:00 committed by GitHub
parent 35d62a659b
commit d97307e99b
7 changed files with 510 additions and 34 deletions
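
For illustration, a minimal sketch of turning the flag on for a single LLM from the Rails console; it assumes a discourse-ai install and uses the provider param name introduced in this commit (the record lookup is hypothetical):

# Rails console sketch (assumes the discourse-ai plugin is installed).
llm = LlmModel.find_by(display_name: "o3-pro")       # hypothetical record
llm.provider_params ||= {}
llm.provider_params["enable_responses_api"] = true   # checkbox added in this commit
llm.save!

# Downstream code reads the flag back the same way the dialect and endpoint do:
llm.lookup_custom_param("enable_responses_api")      # => true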


@@ -52,6 +52,7 @@ class LlmModel < ActiveRecord::Base
disable_temperature: :checkbox,
disable_top_p: :checkbox,
disable_streaming: :checkbox,
enable_responses_api: :checkbox,
reasoning_effort: {
type: :enum,
values: %w[default low medium high],


@@ -579,6 +579,7 @@ en:
reasoning_tokens: "Number of tokens used for reasoning"
disable_temperature: "Disable temperature (some thinking models don't support temperature)"
disable_top_p: "Disable top P (some thinking models don't support top P)"
enable_responses_api: "Enable responses API (required on certain OpenAI models)"
related_topics:
title: "Related topics"


@@ -20,12 +20,19 @@ module DiscourseAi
def embed_user_ids?
return @embed_user_ids if defined?(@embed_user_ids)
@embed_user_ids =
@embed_user_ids = true if responses_api?
@embed_user_ids ||=
prompt.messages.any? do |m|
m[:id] && m[:type] == :user && !m[:id].to_s.match?(VALID_ID_REGEX)
end
end
def responses_api?
return @responses_api if defined?(@responses_api)
@responses_api = llm_model.lookup_custom_param("enable_responses_api")
end
def max_prompt_tokens
# provide a buffer of 120 tokens - our function counting is not
# 100% accurate and getting numbers to align exactly is very hard
@@ -51,7 +58,11 @@ module DiscourseAi
if disable_native_tools?
super
else
@tools_dialect ||= DiscourseAi::Completions::Dialects::OpenAiTools.new(prompt.tools)
@tools_dialect ||=
DiscourseAi::Completions::Dialects::OpenAiTools.new(
prompt.tools,
responses_api: responses_api?,
)
end
end
@@ -120,7 +131,7 @@ module DiscourseAi
to_encoded_content_array(
content: content_array.flatten,
image_encoder: ->(details) { image_node(details) },
text_encoder: ->(text) { { type: "text", text: text } },
text_encoder: ->(text) { text_node(text) },
allow_vision: vision_support?,
)
@@ -136,13 +147,21 @@ module DiscourseAi
end
end
def text_node(text)
if responses_api?
{ type: "input_text", text: text }
else
{ type: "text", text: text }
end
end
def image_node(details)
{
type: "image_url",
image_url: {
url: "data:#{details[:mime_type]};base64,#{details[:base64]}",
},
}
encoded_image = "data:#{details[:mime_type]};base64,#{details[:base64]}"
if responses_api?
{ type: "input_image", image_url: encoded_image }
else
{ type: "image_url", image_url: { url: encoded_image } }
end
end
def per_message_overhead
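
To make the responses_api? branches above concrete, a sketch of the content parts each mode produces for the same user message (shapes taken from text_node and image_node; the base64 payload is elided):

# Chat Completions shape (responses_api? == false)
[
  { type: "text", text: "What is in this image?" },
  { type: "image_url", image_url: { url: "data:image/png;base64,..." } },
]

# Responses API shape (responses_api? == true)
[
  { type: "input_text", text: "What is in this image?" },
  { type: "input_image", image_url: "data:image/png;base64,..." },
]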


@@ -4,20 +4,32 @@ module DiscourseAi
module Completions
module Dialects
class OpenAiTools
def initialize(tools)
def initialize(tools, responses_api: false)
@responses_api = responses_api
@raw_tools = tools
end
def translated_tools
raw_tools.map do |tool|
{
type: "function",
function: {
if @responses_api
raw_tools.map do |tool|
{
type: "function",
name: tool.name,
description: tool.description,
parameters: tool.parameters_json_schema,
},
}
}
end
else
raw_tools.map do |tool|
{
type: "function",
function: {
name: tool.name,
description: tool.description,
parameters: tool.parameters_json_schema,
},
}
end
end
end
@@ -30,20 +42,37 @@ module DiscourseAi
call_details[:arguments] = call_details[:arguments].to_json
call_details[:name] = raw_message[:name]
{
role: "assistant",
content: nil,
tool_calls: [{ type: "function", function: call_details, id: raw_message[:id] }],
}
if @responses_api
{
type: "function_call",
call_id: raw_message[:id],
name: call_details[:name],
arguments: call_details[:arguments],
}
else
{
role: "assistant",
content: nil,
tool_calls: [{ type: "function", function: call_details, id: raw_message[:id] }],
}
end
end
def from_raw_tool(raw_message)
{
role: "tool",
tool_call_id: raw_message[:id],
content: raw_message[:content],
name: raw_message[:name],
}
if @responses_api
{
type: "function_call_output",
call_id: raw_message[:id],
output: raw_message[:content],
}
else
{
role: "tool",
tool_call_id: raw_message[:id],
content: raw_message[:content],
name: raw_message[:name],
}
end
end
private
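
As a side-by-side reference, the hashes the two branches emit for a hypothetical echo tool (identifiers and values are illustrative; shapes are taken from the code above):

# Tool definition (translated_tools)
# Chat Completions: schema nested under a function key
{ type: "function", function: { name: "echo", description: "echo a string", parameters: { type: "object" } } }
# Responses API: flattened onto the tool itself
{ type: "function", name: "echo", description: "echo a string", parameters: { type: "object" } }

# Assistant tool call (from_raw_tool_call)
# Chat Completions
{ role: "assistant", content: nil, tool_calls: [{ type: "function", function: { name: "echo", arguments: "{\"string\":\"hi\"}" }, id: "call_123" }] }
# Responses API
{ type: "function_call", call_id: "call_123", name: "echo", arguments: "{\"string\":\"hi\"}" }

# Tool result (from_raw_tool)
# Chat Completions
{ role: "tool", tool_call_id: "call_123", content: "hi", name: "echo" }
# Responses API
{ type: "function_call_output", call_id: "call_123", output: "hi" }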


@@ -89,6 +89,7 @@ module DiscourseAi
# We'll fallback to guess this using the tokenizer.
payload[:stream_options] = { include_usage: true } if llm_model.provider == "open_ai"
end
if !xml_tools_enabled?
if dialect.tools.present?
payload[:tools] = dialect.tools
@@ -96,19 +97,39 @@ module DiscourseAi
if dialect.tool_choice == :none
payload[:tool_choice] = "none"
else
payload[:tool_choice] = {
type: "function",
function: {
name: dialect.tool_choice,
},
}
if responses_api?
payload[:tool_choice] = { type: "function", name: dialect.tool_choice }
else
payload[:tool_choice] = {
type: "function",
function: {
name: dialect.tool_choice,
},
}
end
end
end
end
end
convert_payload_to_responses_api!(payload) if responses_api?
payload
end
def responses_api?
return @responses_api if defined?(@responses_api)
@responses_api = llm_model.lookup_custom_param("enable_responses_api")
end
def convert_payload_to_responses_api!(payload)
payload[:input] = payload.delete(:messages)
completion_tokens = payload.delete(:max_completion_tokens) || payload.delete(:max_tokens)
payload[:max_output_tokens] = completion_tokens if completion_tokens
# not supported in responses api
payload.delete(:stream_options)
end
def prepare_request(payload)
headers = { "Content-Type" => "application/json" }
api_key = llm_model.api_key
@@ -159,7 +180,12 @@ module DiscourseAi
private
def processor
@processor ||= OpenAiMessageProcessor.new(partial_tool_calls: partial_tool_calls)
@processor ||=
if responses_api?
OpenAiResponsesMessageProcessor.new(partial_tool_calls: partial_tool_calls)
else
OpenAiMessageProcessor.new(partial_tool_calls: partial_tool_calls)
end
end
end
end
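
To make convert_payload_to_responses_api! concrete, a before/after sketch of the renames it applies to an already-built Chat Completions payload (model name and values are illustrative):

before = {
  model: "o3-pro",
  messages: [{ role: "user", content: "Hello" }],
  max_completion_tokens: 1024,
  stream: true,
  stream_options: { include_usage: true },
}

# after convert_payload_to_responses_api!(before)
{
  model: "o3-pro",
  input: [{ role: "user", content: "Hello" }],  # :messages renamed to :input
  max_output_tokens: 1024,                      # max_(completion_)tokens renamed
  stream: true,                                 # :stream_options dropped (unsupported)
}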


@@ -0,0 +1,160 @@
# frozen_string_literal: true
module DiscourseAi::Completions
class OpenAiResponsesMessageProcessor
attr_reader :prompt_tokens, :completion_tokens, :cached_tokens
def initialize(partial_tool_calls: false)
@tool = nil # currently streaming ToolCall
@tool_arguments = +""
@prompt_tokens = nil
@completion_tokens = nil
@cached_tokens = nil
@partial_tool_calls = partial_tool_calls
@streaming_parser = nil # JsonStreamingTracker, if used
@has_new_data = false
end
# @param json [Hash] full JSON response from responses.create / retrieve
# @return [Array<String,ToolCall>] pieces in the order they were produced
def process_message(json)
result = []
(json[:output] || []).each do |item|
type = item[:type]
case type
when "function_call"
result << build_tool_call_from_item(item)
when "message"
text = extract_text(item)
result << text if text
end
end
update_usage(json)
result
end
# @param json [Hash] a single streamed event, already parsed from ND-JSON
# @return [String, ToolCall, nil] only when a complete chunk is ready
def process_streamed_message(json)
rval = nil
event_type = json[:type] || json["type"]
case event_type
when "response.output_text.delta"
delta = json[:delta] || json["delta"]
rval = delta if !delta.empty?
when "response.output_item.added"
item = json[:item]
if item && item[:type] == "function_call"
handle_tool_stream(:start, item) { |finished| rval = finished }
end
when "response.function_call_arguments.delta"
delta = json[:delta]
handle_tool_stream(:progress, delta) { |finished| rval = finished } if delta
when "response.output_item.done"
item = json[:item]
if item && item[:type] == "function_call"
handle_tool_stream(:done, item) { |finished| rval = finished }
end
end
update_usage(json)
rval
end
# Called by JsonStreamingTracker when partial JSON arguments are parsed
def notify_progress(key, value)
if @tool
@tool.partial = true
@tool.parameters[key.to_sym] = value
@has_new_data = true
end
end
def current_tool_progress
if @has_new_data
@has_new_data = false
@tool
end
end
def finish
rval = []
if @tool
process_arguments
rval << @tool
@tool = nil
end
rval
end
private
def extract_text(message_item)
(message_item[:content] || message_item["content"] || [])
.filter { |c| (c[:type] || c["type"]) == "output_text" }
.map { |c| c[:text] || c["text"] }
.join
end
def build_tool_call_from_item(item)
id = item[:call_id]
name = item[:name]
arguments = item[:arguments] || ""
params = arguments.empty? ? {} : JSON.parse(arguments, symbolize_names: true)
ToolCall.new(id: id, name: name, parameters: params)
end
def handle_tool_stream(event_type, json)
if event_type == :start
start_tool_stream(json)
elsif event_type == :progress
@streaming_parser << json if @streaming_parser
yield current_tool_progress
elsif event_type == :done
@tool_arguments << json[:arguments].to_s
process_arguments
finished = @tool
@tool = nil
yield finished
end
end
def start_tool_stream(data)
# important note... streaming API has both id and call_id
# both seem to work as identifiers, api examples seem to favor call_id
# so I am using it here
id = data[:call_id]
name = data[:name]
@tool_arguments = +""
@tool = ToolCall.new(id: id, name: name)
@streaming_parser = JsonStreamingTracker.new(self) if @partial_tool_calls
end
# Parse accumulated @tool_arguments once we have a complete JSON blob
def process_arguments
return if @tool_arguments.to_s.empty?
parsed = JSON.parse(@tool_arguments, symbolize_names: true)
@tool.parameters = parsed
@tool.partial = false
@tool_arguments = nil
rescue JSON::ParserError
# leave arguments empty; caller can decide how to handle
end
def update_usage(json)
usage = json.dig(:response, :usage)
return if !usage
cached_tokens = usage.dig(:input_tokens_details, :cached_tokens).to_i
@prompt_tokens ||= usage[:input_tokens] - cached_tokens
@completion_tokens ||= usage[:output_tokens]
@cached_tokens ||= cached_tokens
end
end
end
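
A minimal sketch of driving the processor by hand, assuming each SSE data payload has already been parsed into a symbol-keyed hash (in production the endpoint class does this wiring):

processor =
  DiscourseAi::Completions::OpenAiResponsesMessageProcessor.new(partial_tool_calls: false)

events = [
  { type: "response.output_text.delta", delta: "Hello" },
  { type: "response.output_text.delta", delta: " World" },
  {
    type: "response.completed",
    response: {
      usage: {
        input_tokens: 35,
        output_tokens: 9,
        input_tokens_details: { cached_tokens: 5 },
      },
    },
  },
]

partials = events.filter_map { |event| processor.process_streamed_message(event) }
partials                    # => ["Hello", " World"]
processor.prompt_tokens     # => 30 (input tokens minus cached tokens)
processor.completion_tokens # => 9
processor.cached_tokens     # => 5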


@@ -0,0 +1,240 @@
# frozen_string_literal: true
RSpec.describe DiscourseAi::Completions::Endpoints::OpenAi do
subject(:endpoint) { described_class.new(model) }
fab!(:model) do
Fabricate(
:llm_model,
provider: "open_ai",
url: "https://api.openai.com/v1/responses",
provider_params: {
enable_responses_api: true,
},
)
end
let(:prompt_with_tools) do
prompt = DiscourseAi::Completions::Prompt.new("echo: Hello")
prompt.tools = [
DiscourseAi::Completions::ToolDefinition.new(
name: "echo",
description: "Used for testing of llms, will echo the param given to it",
parameters: [
DiscourseAi::Completions::ToolDefinition::ParameterDefinition.from_hash(
{ name: "string", description: "string to echo", type: :string, required: true },
),
],
),
]
prompt
end
it "can perform simple streaming completion" do
response_payload = <<~TEXT
event: response.created
data: {"type":"response.created","sequence_number":0,"response":{"id":"resp_6848d84bee44819d98e5f4f5103562090333bc932679b022","object":"response","created_at":1749604427,"status":"in_progress","background":false,"error":null,"incomplete_details":null,"instructions":null,"max_output_tokens":null,"model":"gpt-4.1-nano-2025-04-14","output":[],"parallel_tool_calls":true,"previous_response_id":null,"reasoning":{"effort":null,"summary":null},"service_tier":"auto","store":true,"temperature":1.0,"text":{"format":{"type":"text"}},"tool_choice":"auto","tools":[],"top_p":1.0,"truncation":"disabled","usage":null,"user":null,"metadata":{}}}
event: response.in_progress
data: {"type":"response.in_progress","sequence_number":1,"response":{"id":"resp_6848d84bee44819d98e5f4f5103562090333bc932679b022","object":"response","created_at":1749604427,"status":"in_progress","background":false,"error":null,"incomplete_details":null,"instructions":null,"max_output_tokens":null,"model":"gpt-4.1-nano-2025-04-14","output":[],"parallel_tool_calls":true,"previous_response_id":null,"reasoning":{"effort":null,"summary":null},"service_tier":"auto","store":true,"temperature":1.0,"text":{"format":{"type":"text"}},"tool_choice":"auto","tools":[],"top_p":1.0,"truncation":"disabled","usage":null,"user":null,"metadata":{}}}
event: response.output_item.added
data: {"type":"response.output_item.added","sequence_number":2,"output_index":0,"item":{"id":"msg_6848d84c3bc8819dace0eadec6e205090333bc932679b022","type":"message","status":"in_progress","content":[],"role":"assistant"}}
event: response.content_part.added
data: {"type":"response.content_part.added","sequence_number":3,"item_id":"msg_6848d84c3bc8819dace0eadec6e205090333bc932679b022","output_index":0,"content_index":0,"part":{"type":"output_text","annotations":[],"text":""}}
event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":4,"item_id":"msg_6848d84c3bc8819dace0eadec6e205090333bc932679b022","output_index":0,"content_index":0,"delta":"Hello"}
event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":5,"item_id":"msg_6848d84c3bc8819dace0eadec6e205090333bc932679b022","output_index":0,"content_index":0,"delta":" "}
event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":5,"item_id":"msg_6848d84c3bc8819dace0eadec6e205090333bc932679b022","output_index":0,"content_index":0,"delta":"World"}
event: response.output_text.done
data: {"type":"response.output_text.done","sequence_number":5,"item_id":"msg_6848d84c3bc8819dace0eadec6e205090333bc932679b022","output_index":0,"content_index":0,"text":"Hello World"}
event: response.content_part.done
data: {"type":"response.content_part.done","sequence_number":6,"item_id":"msg_6848d84c3bc8819dace0eadec6e205090333bc932679b022","output_index":0,"content_index":0,"part":{"type":"output_text","annotations":[],"text":"Hello World"}}
event: response.output_item.done
data: {"type":"response.output_item.done","sequence_number":7,"output_index":0,"item":{"id":"msg_6848d84c3bc8819dace0eadec6e205090333bc932679b022","type":"message","status":"completed","content":[{"type":"output_text","annotations":[],"text":"Hello World"}],"role":"assistant"}}
event: response.completed
data: {"type":"response.completed","sequence_number":8,"response":{"id":"resp_6848d84bee44819d98e5f4f5103562090333bc932679b022","object":"response","created_at":1749604427,"status":"completed","background":false,"error":null,"incomplete_details":null,"instructions":null,"max_output_tokens":null,"model":"gpt-4.1-nano-2025-04-14","output":[{"id":"msg_6848d84c3bc8819dace0eadec6e205090333bc932679b022","type":"message","status":"completed","content":[{"type":"output_text","annotations":[],"text":"Hello"}],"role":"assistant"}],"parallel_tool_calls":true,"previous_response_id":null,"reasoning":{"effort":null,"summary":null},"service_tier":"default","store":true,"temperature":1.0,"text":{"format":{"type":"text"}},"tool_choice":"auto","tools":[],"top_p":1.0,"truncation":"disabled","usage":{"input_tokens":35,"input_tokens_details":{"cached_tokens":5},"output_tokens":9,"output_tokens_details":{"reasoning_tokens":0},"total_tokens":37},"user":null,"metadata":{}}}
TEXT
partials = []
stub_request(:post, "https://api.openai.com/v1/responses").to_return(
status: 200,
body: response_payload,
)
model
.to_llm
.generate("Say: Hello World", user: Discourse.system_user) { |partial| partials << partial }
expect(partials).to eq(["Hello", " ", "World"])
log = AiApiAuditLog.last
# note: our report counts cache and request tokens separately see: DiscourseAi::Completions::Report
expect(log).to be_present
expect(log.request_tokens).to eq(30)
expect(log.response_tokens).to eq(9)
expect(log.cached_tokens).to eq(5)
end
it "can properly stream tool calls" do
response_payload = <<~TEXT
event: response.created
data: {"type":"response.created","sequence_number":0,"response":{"id":"resp_684910c81eec81a3a9222aa336d9fcf202d35c1819a50f63","object":"response","created_at":1749618888,"status":"in_progress","background":false,"error":null,"incomplete_details":null,"instructions":null,"max_output_tokens":null,"model":"gpt-4.1-nano-2025-04-14","output":[],"parallel_tool_calls":true,"previous_response_id":null,"reasoning":{"effort":null,"summary":null},"service_tier":"auto","store":true,"temperature":1.0,"text":{"format":{"type":"text"}},"tool_choice":{"type":"function","name":"echo"},"tools":[{"type":"function","description":"Used for testing of llms, will echo the param given to it","name":"echo","parameters":{"type":"object","properties":{"string":{"type":"string","description":"string to echo"}},"required":["string"]},"strict":true}],"top_p":1.0,"truncation":"disabled","usage":null,"user":null,"metadata":{}}}
event: response.in_progress
data: {"type":"response.in_progress","sequence_number":1,"response":{"id":"resp_684910c81eec81a3a9222aa336d9fcf202d35c1819a50f63","object":"response","created_at":1749618888,"status":"in_progress","background":false,"error":null,"incomplete_details":null,"instructions":null,"max_output_tokens":null,"model":"gpt-4.1-nano-2025-04-14","output":[],"parallel_tool_calls":true,"previous_response_id":null,"reasoning":{"effort":null,"summary":null},"service_tier":"auto","store":true,"temperature":1.0,"text":{"format":{"type":"text"}},"tool_choice":{"type":"function","name":"echo"},"tools":[{"type":"function","description":"Used for testing of llms, will echo the param given to it","name":"echo","parameters":{"type":"object","properties":{"string":{"type":"string","description":"string to echo"}},"required":["string"]},"strict":true}],"top_p":1.0,"truncation":"disabled","usage":null,"user":null,"metadata":{}}}
event: response.output_item.added
data: {"type":"response.output_item.added","sequence_number":2,"output_index":0,"item":{"id":"fc_684910c8b68881a3b43610e1d57ef00702d35c1819a50f63","type":"function_call","status":"in_progress","arguments":"","call_id":"call_TQyfNmFnKblzXl5rlcGeIsg5","name":"echo"}}
event: response.function_call_arguments.delta
data: {"type":"response.function_call_arguments.delta","sequence_number":3,"item_id":"fc_684910c8b68881a3b43610e1d57ef00702d35c1819a50f63","output_index":0,"delta":"{\\""}
event: response.function_call_arguments.delta
data: {"type":"response.function_call_arguments.delta","sequence_number":4,"item_id":"fc_684910c8b68881a3b43610e1d57ef00702d35c1819a50f63","output_index":0,"delta":"string"}
event: response.function_call_arguments.delta
data: {"type":"response.function_call_arguments.delta","sequence_number":5,"item_id":"fc_684910c8b68881a3b43610e1d57ef00702d35c1819a50f63","output_index":0,"delta":"\\":\\""}
event: response.function_call_arguments.delta
data: {"type":"response.function_call_arguments.delta","sequence_number":6,"item_id":"fc_684910c8b68881a3b43610e1d57ef00702d35c1819a50f63","output_index":0,"delta":"hello"}
event: response.function_call_arguments.delta
data: {"type":"response.function_call_arguments.delta","sequence_number":7,"item_id":"fc_684910c8b68881a3b43610e1d57ef00702d35c1819a50f63","output_index":0,"delta":"\\"}"}
event: response.function_call_arguments.done
data: {"type":"response.function_call_arguments.done","sequence_number":8,"item_id":"fc_684910c8b68881a3b43610e1d57ef00702d35c1819a50f63","output_index":0,"arguments":"{\\"string\\":\\"hello\\"}"}
event: response.output_item.done
data: {"type":"response.output_item.done","sequence_number":9,"output_index":0,"item":{"id":"fc_684910c8b68881a3b43610e1d57ef00702d35c1819a50f63","type":"function_call","status":"completed","arguments":"{\\"string\\":\\"hello\\"}","call_id":"call_TQyfNmFnKblzXl5rlcGeIsg5","name":"echo"}}
event: response.completed
data: {"type":"response.completed","sequence_number":10,"response":{"id":"resp_684910c81eec81a3a9222aa336d9fcf202d35c1819a50f63","object":"response","created_at":1749618888,"status":"completed","background":false,"error":null,"incomplete_details":null,"instructions":null,"max_output_tokens":null,"model":"gpt-4.1-nano-2025-04-14","output":[{"id":"fc_684910c8b68881a3b43610e1d57ef00702d35c1819a50f63","type":"function_call","status":"completed","arguments":"{\\"string\\":\\"hello\\"}","call_id":"call_TQyfNmFnKblzXl5rlcGeIsg5","name":"echo"}],"parallel_tool_calls":true,"previous_response_id":null,"reasoning":{"effort":null,"summary":null},"service_tier":"default","store":true,"temperature":1.0,"text":{"format":{"type":"text"}},"tool_choice":{"type":"function","name":"echo"},"tools":[{"type":"function","description":"Used for testing of llms, will echo the param given to it","name":"echo","parameters":{"type":"object","properties":{"string":{"type":"string","description":"string to echo"}},"required":["string"]},"strict":true}],"top_p":1.0,"truncation":"disabled","usage":{"input_tokens":71,"input_tokens_details":{"cached_tokens":0},"output_tokens":6,"output_tokens_details":{"reasoning_tokens":0},"total_tokens":77},"user":null,"metadata":{}}}
TEXT
partials = []
stub_request(:post, "https://api.openai.com/v1/responses").to_return(
status: 200,
body: response_payload,
)
model
.to_llm
.generate(
prompt_with_tools,
user: Discourse.system_user,
partial_tool_calls: true,
) { |partial| partials << partial.dup }
# the partial tools are deduped
expect(partials.length).to eq(1)
expect(partials.first).to be_a(DiscourseAi::Completions::ToolCall)
expect(partials.first.name).to eq("echo")
expect(partials.first.parameters).to eq({ string: "hello" })
expect(partials.first.id).to eq("call_TQyfNmFnKblzXl5rlcGeIsg5")
end
it "can handle non streaming tool calls" do
response_object = {
id: "resp_68491ed72974819f94652a73fb58109c08901d75ebf6c66e",
object: "response",
created_at: 1_749_622_487,
status: "completed",
background: false,
error: nil,
incomplete_details: nil,
instructions: nil,
max_output_tokens: nil,
model: "gpt-4.1-nano-2025-04-14",
output: [
{
id: "fc_68491ed75e0c819f87462ff642c58d2e08901d75ebf6c66e",
type: "function_call",
status: "completed",
arguments: "{\"string\":\"sam\"}",
call_id: "call_UdxBpinIVc5nRZ0VnWJIgneA",
name: "echo",
},
],
parallel_tool_calls: true,
previous_response_id: nil,
reasoning: {
effort: nil,
summary: nil,
},
service_tier: "default",
store: true,
temperature: 1.0,
text: {
format: {
type: "text",
},
},
tool_choice: {
type: "function",
name: "echo",
},
tools: [
{
type: "function",
description: "Used for testing of llms, will echo the param given to it",
name: "echo",
parameters: {
type: "object",
properties: {
string: {
type: "string",
description: "string to echo",
},
},
required: ["string"],
},
strict: true,
},
],
top_p: 1.0,
truncation: "disabled",
usage: {
input_tokens: 73,
input_tokens_details: {
cached_tokens: 0,
},
output_tokens: 6,
output_tokens_details: {
reasoning_tokens: 0,
},
total_tokens: 79,
},
user: nil,
metadata: {
},
}
stub_request(:post, "https://api.openai.com/v1/responses").to_return(
status: 200,
body: response_object.to_json,
)
result = model.to_llm.generate(prompt_with_tools, user: Discourse.system_user)
expect(result).to be_a(DiscourseAi::Completions::ToolCall)
expect(result.name).to eq("echo")
expect(result.parameters).to eq({ string: "sam" })
expect(result.id).to eq("call_UdxBpinIVc5nRZ0VnWJIgneA")
end
end
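
The stubbed bodies above use standard server-sent-events framing. As a rough illustration only (Discourse ships its own streaming decoder, which is not shown in this diff), each data: line can be parsed into the hashes the processor consumes:

require "json"

# response_payload is an SSE body like the ones stubbed above
events =
  response_payload
    .each_line
    .filter_map do |line|
      line = line.strip
      JSON.parse(line.delete_prefix("data: "), symbolize_names: true) if line.start_with?("data: ")
    end

events.map { |e| e[:type] }
# => ["response.created", "response.in_progress", "response.output_item.added", ...]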