diff --git a/config/eval-llms.yml b/config/eval-llms.yml
index fe48dd24..7b25730c 100644
--- a/config/eval-llms.yml
+++ b/config/eval-llms.yml
@@ -39,6 +39,16 @@ llms:
     max_prompt_tokens: 200000
     vision_enabled: true
 
+  claude-3.7-sonnet:
+    display_name: Claude 3.7 Sonnet
+    name: claude-3-7-sonnet-latest
+    tokenizer: DiscourseAi::Tokenizer::AnthropicTokenizer
+    api_key_env: ANTHROPIC_API_KEY
+    provider: anthropic
+    url: https://api.anthropic.com/v1/messages
+    max_prompt_tokens: 200000
+    vision_enabled: true
+
   gemini-2.0-flash:
     display_name: Gemini 2.0 Flash
     name: gemini-2-0-flash
diff --git a/evals/lib/eval.rb b/evals/lib/eval.rb
index 4a62e332..693959ec 100644
--- a/evals/lib/eval.rb
+++ b/evals/lib/eval.rb
@@ -121,7 +121,7 @@ class DiscourseAi::Evals::Eval
   def judge_result(result)
     prompt = judge[:prompt].dup
     prompt.sub!("{{output}}", result)
-    prompt.sub!("{{input}}", args[:input])
+    args.each { |key, value| prompt.sub!("{{#{key}}}", value.to_s) }
 
     prompt += <<~SUFFIX
 
@@ -145,7 +145,8 @@ class DiscourseAi::Evals::Eval
         messages: [{ type: :user, content: prompt }],
       )
 
-    result = judge_llm.llm_model.to_llm.generate(prompt, user: Discourse.system_user)
+    result =
+      judge_llm.llm_model.to_llm.generate(prompt, user: Discourse.system_user, temperature: 0)
 
     if rating = result.match(%r{\[RATING\](\d+)\[/RATING\]})
       rating = rating[1].to_i
@@ -219,7 +220,7 @@ class DiscourseAi::Evals::Eval
     upload.destroy if upload
   end
 
-  def prompt_call(llm, system_prompt:, message:, tools: nil, stream: false)
+  def prompt_call(llm, system_prompt:, message:, temperature: nil, tools: nil, stream: false)
     if tools
       tools.each do |tool|
         tool.symbolize_keys!
@@ -230,16 +231,19 @@ class DiscourseAi::Evals::Eval
       DiscourseAi::Completions::Prompt.new(
         system_prompt,
         messages: [{ type: :user, content: message }],
-        tools: tools,
       )
+    prompt.tools = tools if tools
+
     result = nil
 
     if stream
       result = []
       llm
         .llm_model
         .to_llm
-        .generate(prompt, user: Discourse.system_user) { |partial| result << partial }
+        .generate(prompt, user: Discourse.system_user, temperature: temperature) do |partial|
+          result << partial
+        end
     else
       result = llm.llm_model.to_llm.generate(prompt, user: Discourse.system_user)
     end