From 9a6aec2cf6554413cb28a08fcea97fda74108ae0 Mon Sep 17 00:00:00 2001 From: Sam Date: Tue, 18 Feb 2025 07:58:54 +1100 Subject: [PATCH] DEV: eval support for tool calls (#1128) Also fixes anthropic with no params, streaming calls --- evals/lib/eval.rb | 47 ++++++++++++++++++- .../anthropic_message_processor.rb | 3 +- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/evals/lib/eval.rb b/evals/lib/eval.rb index b5db9427..3e632854 100644 --- a/evals/lib/eval.rb +++ b/evals/lib/eval.rb @@ -9,7 +9,8 @@ class DiscourseAi::Evals::Eval :args, :vision, :expected_output, - :expected_output_regex + :expected_output_regex, + :expected_tool_call def initialize(path:) @yaml = YAML.load_file(path).symbolize_keys @@ -24,6 +25,8 @@ class DiscourseAi::Evals::Eval @expected_output_regex = @yaml[:expected_output_regex] @expected_output_regex = Regexp.new(@expected_output_regex, Regexp::MULTILINE) if @expected_output_regex + @expected_tool_call = @yaml[:expected_tool_call] + @expected_tool_call.symbolize_keys! if @expected_tool_call @args[:path] = File.expand_path(File.join(File.dirname(path), @args[:path])) if @args&.key?( :path, @@ -39,6 +42,8 @@ class DiscourseAi::Evals::Eval pdf_to_text(llm, **args) when "image_to_text" image_to_text(llm, **args) + when "prompt" + prompt_call(llm, **args) end if expected_output @@ -53,6 +58,19 @@ class DiscourseAi::Evals::Eval else { result: :fail, expected_output: expected_output_regex, actual_output: result } end + elsif expected_tool_call + tool_call = result + + if result.is_a?(Array) + tool_call = result.find { |r| r.is_a?(DiscourseAi::Completions::ToolCall) } + end + if !tool_call.is_a?(DiscourseAi::Completions::ToolCall) || + (tool_call.name != expected_tool_call[:name]) || + (tool_call.parameters != expected_tool_call[:params]) + { result: :fail, expected_output: expected_tool_call, actual_output: result } + else + { result: :pass } + end else { result: :unknown, actual_output: result } end @@ -133,4 +151,31 @@ class DiscourseAi::Evals::Eval ensure upload.destroy if upload end + + def prompt_call(llm, system_prompt:, message:, tools: nil, stream: false) + if tools + tools.each do |tool| + tool.symbolize_keys! + tool[:parameters].symbolize_keys! if tool[:parameters] + end + end + prompt = + DiscourseAi::Completions::Prompt.new( + system_prompt, + messages: [{ type: :user, content: message }], + tools: tools, + ) + + result = nil + if stream + result = [] + llm + .llm_model + .to_llm + .generate(prompt, user: Discourse.system_user) { |partial| result << partial } + else + result = llm.llm_model.to_llm.generate(prompt, user: Discourse.system_user) + end + result + end end diff --git a/lib/completions/anthropic_message_processor.rb b/lib/completions/anthropic_message_processor.rb index aed06502..44242b2d 100644 --- a/lib/completions/anthropic_message_processor.rb +++ b/lib/completions/anthropic_message_processor.rb @@ -34,7 +34,8 @@ class DiscourseAi::Completions::AnthropicMessageProcessor end def to_tool_call - parameters = JSON.parse(raw_json, symbolize_names: true) + parameters = {} + parameters = JSON.parse(raw_json, symbolize_names: true) if raw_json.present? # we dupe to avoid poisoning the original tool call @tool_call = @tool_call.dup @tool_call.partial = false