From 9a6aec2cf6554413cb28a08fcea97fda74108ae0 Mon Sep 17 00:00:00 2001
From: Sam <sam.saffron@gmail.com>
Date: Tue, 18 Feb 2025 07:58:54 +1100
Subject: [PATCH] DEV: eval support for tool calls (#1128)

Also fixes Anthropic streaming tool calls that have no params
---
 evals/lib/eval.rb                             | 47 ++++++++++++++++++-
 .../anthropic_message_processor.rb            |  3 +-
 2 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/evals/lib/eval.rb b/evals/lib/eval.rb
index b5db9427..3e632854 100644
--- a/evals/lib/eval.rb
+++ b/evals/lib/eval.rb
@@ -9,7 +9,8 @@ class DiscourseAi::Evals::Eval
               :args,
               :vision,
               :expected_output,
-              :expected_output_regex
+              :expected_output_regex,
+              :expected_tool_call
 
   def initialize(path:)
     @yaml = YAML.load_file(path).symbolize_keys
@@ -24,6 +25,8 @@ class DiscourseAi::Evals::Eval
     @expected_output_regex = @yaml[:expected_output_regex]
     @expected_output_regex =
       Regexp.new(@expected_output_regex, Regexp::MULTILINE) if @expected_output_regex
+    @expected_tool_call = @yaml[:expected_tool_call]
+    @expected_tool_call.symbolize_keys! if @expected_tool_call
 
     @args[:path] = File.expand_path(File.join(File.dirname(path), @args[:path])) if @args&.key?(
       :path,
@@ -39,6 +42,8 @@ class DiscourseAi::Evals::Eval
         pdf_to_text(llm, **args)
       when "image_to_text"
         image_to_text(llm, **args)
+      when "prompt"
+        prompt_call(llm, **args)
       end
 
     if expected_output
@@ -53,6 +58,19 @@ class DiscourseAi::Evals::Eval
       else
         { result: :fail, expected_output: expected_output_regex, actual_output: result }
       end
+    elsif expected_tool_call
+      tool_call = result
+
+      if result.is_a?(Array)
+        tool_call = result.find { |r| r.is_a?(DiscourseAi::Completions::ToolCall) }
+      end
+      if !tool_call.is_a?(DiscourseAi::Completions::ToolCall) ||
+           (tool_call.name != expected_tool_call[:name]) ||
+           (tool_call.parameters != expected_tool_call[:params])
+        { result: :fail, expected_output: expected_tool_call, actual_output: result }
+      else
+        { result: :pass }
+      end
     else
       { result: :unknown, actual_output: result }
     end
@@ -133,4 +151,31 @@ class DiscourseAi::Evals::Eval
   ensure
     upload.destroy if upload
   end
+
+  def prompt_call(llm, system_prompt:, message:, tools: nil, stream: false)
+    if tools
+      tools.each do |tool|
+        tool.symbolize_keys!
+        tool[:parameters].symbolize_keys! if tool[:parameters]
+      end
+    end
+    prompt =
+      DiscourseAi::Completions::Prompt.new(
+        system_prompt,
+        messages: [{ type: :user, content: message }],
+        tools: tools,
+      )
+
+    result = nil
+    if stream
+      result = []
+      llm
+        .llm_model
+        .to_llm
+        .generate(prompt, user: Discourse.system_user) { |partial| result << partial }
+    else
+      result = llm.llm_model.to_llm.generate(prompt, user: Discourse.system_user)
+    end
+    result
+  end
 end
diff --git a/lib/completions/anthropic_message_processor.rb b/lib/completions/anthropic_message_processor.rb
index aed06502..44242b2d 100644
--- a/lib/completions/anthropic_message_processor.rb
+++ b/lib/completions/anthropic_message_processor.rb
@@ -34,7 +34,8 @@ class DiscourseAi::Completions::AnthropicMessageProcessor
     end
 
     def to_tool_call
-      parameters = JSON.parse(raw_json, symbolize_names: true)
+      parameters = {}
+      parameters = JSON.parse(raw_json, symbolize_names: true) if raw_json.present?
       # we dupe to avoid poisoning the original tool call
       @tool_call = @tool_call.dup
       @tool_call.partial = false