diff --git a/README.md b/README.md
index cd7ae38a..6bbc5a8b 100644
--- a/README.md
+++ b/README.md
@@ -3,3 +3,26 @@
 **Plugin Summary**
 
 For more information, please see: https://meta.discourse.org/t/discourse-ai/259214?u=falco
+
+### Evals
+
+The directory `evals` contains AI evals for the Discourse AI plugin.
+
+To run them use:
+
+cd evals
+./run --help
+
+```
+Usage: evals/run [options]
+    -e, --eval NAME                  Name of the evaluation to run
+        --list-models                List models
+    -m, --model NAME                 Model to evaluate (will eval all models if not specified)
+    -l, --list                       List evals
+```
+
+To run evals you will need to configure API keys in your environment:
+
+OPENAI_API_KEY=your_openai_api_key
+ANTHROPIC_API_KEY=your_anthropic_api_key
+GEMINI_API_KEY=your_gemini_api_key
diff --git a/app/models/ai_artifact_version.rb b/app/models/ai_artifact_version.rb
index 94589a8f..9a7d2b14 100644
--- a/app/models/ai_artifact_version.rb
+++ b/app/models/ai_artifact_version.rb
@@ -4,6 +4,19 @@ class AiArtifactVersion < ActiveRecord::Base
   validates :html, length: { maximum: 65_535 }
   validates :css, length: { maximum: 65_535 }
   validates :js, length: { maximum: 65_535 }
+
+  # used when generating test cases
+  def write_to(path)
+    css_path = "#{path}/main.css"
+    html_path = "#{path}/main.html"
+    js_path = "#{path}/main.js"
+    instructions_path = "#{path}/instructions.txt"
+
+    File.write(css_path, css)
+    File.write(html_path, html)
+    File.write(js_path, js)
+    File.write(instructions_path, change_description)
+  end
 end
 
 # == Schema Information
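
As a rough, hypothetical illustration of how the `write_to` helper above might be used to capture an eval fixture from the Rails console — the fixture directory name is made up, and any saved version works:

```ruby
# Hypothetical Rails console sketch: dump an existing artifact version into a
# directory that an edit_artifact eval can point its *_path arguments at.
require "fileutils"

version = AiArtifactVersion.last # any saved version will do
fixture_dir = "evals/cases/edit-artifact-example" # made-up path
FileUtils.mkdir_p(fixture_dir)

# Writes main.css, main.html, main.js and instructions.txt into fixture_dir
version.write_to(fixture_dir)
```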
diff --git a/evals/lib/eval.rb b/evals/lib/eval.rb
index 05b90a15..280ab871 100644
--- a/evals/lib/eval.rb
+++ b/evals/lib/eval.rb
@@ -10,7 +10,17 @@ class DiscourseAi::Evals::Eval
     :vision,
     :expected_output,
     :expected_output_regex,
-    :expected_tool_call
+    :expected_tool_call,
+    :judge
+
+  class EvalError < StandardError
+    attr_reader :context
+
+    def initialize(message, context)
+      super(message)
+      @context = context
+    end
+  end
 
   def initialize(path:)
     @yaml = YAML.load_file(path).symbolize_keys
@@ -27,10 +37,14 @@ class DiscourseAi::Evals::Eval
       Regexp.new(@expected_output_regex, Regexp::MULTILINE) if @expected_output_regex
     @expected_tool_call = @yaml[:expected_tool_call]
     @expected_tool_call.symbolize_keys! if @expected_tool_call
+    @judge = @yaml[:judge]
+    @judge.symbolize_keys! if @judge
 
-    @args[:path] = File.expand_path(File.join(File.dirname(path), @args[:path])) if @args&.key?(
-      :path,
-    )
+    @args.each do |key, value|
+      if (key.to_s.include?("_path") || key.to_s == "path") && value.is_a?(String)
+        @args[key] = File.expand_path(File.join(File.dirname(path), value))
+      end
+    end
   end
 
   def run(llm:)
@@ -44,6 +58,8 @@ class DiscourseAi::Evals::Eval
         image_to_text(llm, **args)
       when "prompt"
         prompt_call(llm, **args)
+      when "edit_artifact"
+        edit_artifact(llm, **args)
       end
 
     if expected_output
@@ -53,7 +69,7 @@ class DiscourseAi::Evals::Eval
         { result: :fail, expected_output: expected_output, actual_output: result }
       end
     elsif expected_output_regex
-      if result.match?(expected_output_regex)
+      if result.to_s.match?(expected_output_regex)
         { result: :pass }
       else
         { result: :fail, expected_output: expected_output_regex, actual_output: result }
@@ -71,9 +87,13 @@ class DiscourseAi::Evals::Eval
       else
         { result: :pass }
       end
+    elsif judge
+      judge_result(result)
     else
-      { result: :unknown, actual_output: result }
+      { result: :pass }
     end
+  rescue EvalError => e
+    { result: :fail, message: e.message, context: e.context }
   end
 
   def print
@@ -96,14 +116,68 @@ class DiscourseAi::Evals::Eval
 
   private
 
-  def helper(llm, input:, name:)
+  def judge_result(result)
+    prompt = judge[:prompt].dup
+    prompt.sub!("{{output}}", result)
+    prompt.sub!("{{input}}", args[:input])
+
+    prompt += <<~SUFFIX
+
+      Reply with a rating from 1 to 10, where 10 is perfect and 1 is terrible.
+
+      example output:
+
+      [RATING]10[/RATING] perfect output
+
+      example output:
+
+      [RATING]5[/RATING]
+
+      the following failed to preserve... etc...
+    SUFFIX
+
+    judge_llm = DiscourseAi::Evals::Llm.choose(judge[:llm]).first
+
+    DiscourseAi::Completions::Prompt.new(
+      "You are an expert judge tasked at testing LLM outputs.",
+      messages: [{ type: :user, content: prompt }],
+    )
+
+    result = judge_llm.llm_model.to_llm.generate(prompt, user: Discourse.system_user)
+
+    if rating = result.match(%r{\[RATING\](\d+)\[/RATING\]})
+      rating = rating[1].to_i
+    end
+
+    if rating.to_i >= judge[:pass_rating]
+      { result: :pass }
+    else
+      {
+        result: :fail,
+        message: "LLM Rating below threshold, it was #{rating}, expecting #{judge[:pass_rating]}",
+        context: result,
+      }
+    end
+  end
+
+  def helper(llm, input:, name:, locale: nil)
     completion_prompt = CompletionPrompt.find_by(name: name)
     helper = DiscourseAi::AiHelper::Assistant.new(helper_llm: llm.llm_proxy)
+    user = Discourse.system_user
+    if locale
+      user = User.new
+      class << user
+        attr_accessor :effective_locale
+      end
+
+      user.effective_locale = locale
+      user.admin = true
+    end
     result =
       helper.generate_and_send_prompt(
         completion_prompt,
         input,
-        current_user = Discourse.system_user,
+        current_user = user,
         _force_default_locale = false,
       )
 
@@ -169,4 +243,73 @@ class DiscourseAi::Evals::Eval
     end
     result
   end
+
+  def edit_artifact(llm, css_path:, js_path:, html_path:, instructions_path:)
+    css = File.read(css_path)
+    js = File.read(js_path)
+    html = File.read(html_path)
+    instructions = File.read(instructions_path)
+    artifact =
+      AiArtifact.create!(
+        css: css,
+        js: js,
+        html: html,
+        user_id: Discourse.system_user.id,
+        post_id: 1,
+        name: "eval artifact",
+      )
+
+    post = Post.new(topic_id: 1, id: 1)
+    diff =
+      DiscourseAi::AiBot::ArtifactUpdateStrategies::Diff.new(
+        llm: llm.llm_model.to_llm,
+        post: post,
+        user: Discourse.system_user,
+        artifact: artifact,
+        artifact_version: nil,
+        instructions: instructions,
+      )
+    diff.apply
+
+    if diff.failed_searches.present?
+      puts "Eval Errors encountered"
+      p diff.failed_searches
+      raise EvalError.new("Failed to apply all changes", diff.failed_searches)
+    end
+
+    version = artifact.versions.last
+    raise EvalError.new("Invalid JS", version.js) if !valid_javascript?(version.js)
+
+    output = { css: version.css, js: version.js, html: version.html }
+
+    artifact.destroy
+    output
+  end
+
+  def valid_javascript?(str)
+    require "open3"
+
+    # Create a temporary file with the JavaScript code
+    Tempfile.create(%w[test .js]) do |f|
+      f.write(str)
+      f.flush
+
+      File.write("/tmp/test.js", str)
+
+      begin
+        Discourse::Utils.execute_command(
+          "node",
+          "--check",
+          f.path,
+          failure_message: "Invalid JavaScript syntax",
+          timeout: 30, # reasonable timeout in seconds
+        )
+        true
+      rescue Discourse::Utils::CommandError
+        false
+      end
+    end
+  rescue StandardError
+    false
+  end
 end
diff --git a/evals/lib/runner.rb b/evals/lib/runner.rb
index 87cc153e..86fa34b2 100644
--- a/evals/lib/runner.rb
+++ b/evals/lib/runner.rb
@@ -155,9 +155,18 @@ class DiscourseAi::Evals::Runner
 
       if result[:result] == :fail
         puts "Failed 🔴"
-        puts "---- Expected ----\n#{result[:expected_output]}"
-        puts "---- Actual ----\n#{result[:actual_output]}"
+        puts "Error: #{result[:message]}" if result[:message]
+        # this is deliberate, it creates a lot of noise, but sometimes for debugging it's useful
+        #puts "Context: #{result[:context].to_s[0..2000]}" if result[:context]
+        if result[:expected_output] && result[:actual_output]
+          puts "---- Expected ----\n#{result[:expected_output]}"
+          puts "---- Actual ----\n#{result[:actual_output]}"
+        end
         logger.error("Evaluation failed with LLM: #{llm.name}")
+        logger.error("Error: #{result[:message]}") if result[:message]
+        logger.error("Expected: #{result[:expected_output]}") if result[:expected_output]
+        logger.error("Actual: #{result[:actual_output]}") if result[:actual_output]
+        logger.error("Context: #{result[:context]}") if result[:context]
       elsif result[:result] == :pass
         puts "Passed 🟢"
         logger.info("Evaluation passed with LLM: #{llm.name}")
diff --git a/lib/ai_bot/artifact_update_strategies/diff.rb b/lib/ai_bot/artifact_update_strategies/diff.rb
index fc2c2345..4a0ff28d 100644
--- a/lib/ai_bot/artifact_update_strategies/diff.rb
+++ b/lib/ai_bot/artifact_update_strategies/diff.rb
@@ -3,8 +3,15 @@ module DiscourseAi
   module AiBot
     module ArtifactUpdateStrategies
       class Diff < Base
+        attr_reader :failed_searches
+
         private
 
+        def initialize(**kwargs)
+          super
+          @failed_searches = []
+        end
+
         def build_prompt
           DiscourseAi::Completions::Prompt.new(
             system_prompt,
@@ -51,15 +58,21 @@
             content = source.public_send(section == :javascript ? :js : section)
             blocks.each do |block|
               begin
-                content =
-                  DiscourseAi::Utils::DiffUtils::SimpleDiff.apply(
-                    content,
-                    block[:search],
-                    block[:replace],
-                  )
+                if !block[:search]
+                  content = block[:replace]
+                else
+                  content =
+                    DiscourseAi::Utils::DiffUtils::SimpleDiff.apply(
+                      content,
+                      block[:search],
+                      block[:replace],
+                    )
+                end
               rescue DiscourseAi::Utils::DiffUtils::SimpleDiff::NoMatchError
+                @failed_searches << { section: section, search: block[:search] }
                 # TODO, we may need to inform caller here, LLM made a mistake which it
                 # should correct
+                puts "Failed to find search: #{block[:search]}"
               end
             end
             updated_content[section == :javascript ? :js : section] = content
@@ -76,7 +89,8 @@
         private
 
         def extract_search_replace_blocks(content)
-          return nil if content.blank?
+          return nil if content.blank? || content.to_s.strip.downcase.match?(/^\(?no changes?\)?$/m)
+          return [{ replace: content }] if !content.match?(/<<+\s*SEARCH/)
           blocks = []
           remaining = content
 
@@ -98,29 +112,35 @@
         1. Use EXACTLY this format for changes:
            <<<<<<< SEARCH
-           (exact code to find)
+           (first line of code to replace)
+           (other lines of code to avoid ambiguity)
+           (last line of code to replace)
            =======
            (replacement code)
            >>>>>>> REPLACE
         2. DO NOT modify the markers or add spaces around them
         3. DO NOT add explanations or comments within sections
         4. ONLY include [HTML], [CSS], and [JavaScript] sections if they have changes
-        5. Ensure search text matches EXACTLY - partial matches will fail
-        6. Keep changes minimal and focused
-        7. HTML should not include <html>, <head>, or <body> tags, it is injected into a template
+        5. HTML should not include <html>, <head>, or <body> tags, it is injected into a template
+        6. When specifying a SEARCH block, ALWAYS keep it 8 lines or less, you will be interrupted and a retry will be required if you exceed this limit
+        7. NEVER EVER ask followup questions, ALL changes must be performed in a single response, you are consumed via an API, there is no opportunity for humans in the loop
+        8. When performing a non-contiguous search, ALWAYS use ... to denote the skipped lines
+        9. Be mindful that ... non-contiguous search is not greedy, the following line will only match the first occurrence of the search block
+        10. Never mix a full section replacement with a search/replace block in the same section
+        11. ALWAYS skip sections you do not want to change, do not include them in the response
 
         JavaScript libraries must be sourced from the following CDNs, otherwise CSP will reject it:
         #{AiArtifact::ALLOWED_CDN_SOURCES.join("\n")}
 
         Reply Format:
         [HTML]
-        (changes or empty if no changes)
+        (changes or empty if no changes or entire HTML)
         [/HTML]
         [CSS]
-        (changes or empty if no changes)
+        (changes or empty if no changes or entire CSS)
         [/CSS]
         [JavaScript]
-        (changes or empty if no changes)
+        (changes or empty if no changes or entire JavaScript)
         [/JavaScript]
 
         Example - Multiple changes in one file:
@@ -152,6 +172,68 @@
         .text { font-size: 16px; }
         >>>>>>> REPLACE
         [/CSS]
+
+        Example - Non contiguous search in CSS (replace most CSS with new CSS)
+
+        Original CSS:
+
+        [CSS]
+        body {
+          color: red;
+        }
+        .button {
+          color: blue;
+        }
+        .alert {
+          background-color: green;
+        }
+        .alert2 {
+          background-color: green;
+        }
+        [/CSS]
+
+        [CSS]
+        <<<<<<< SEARCH
+        body {
+        ...
+          background-color: green;
+        }
+        =======
+        body {
+          color: red;
+        }
+        >>>>>>> REPLACE
+
+        RESULT:
+
+        [CSS]
+        body {
+          color: red;
+        }
+        .alert2 {
+          background-color: green;
+        }
+        [/CSS]
+
+        Example - full HTML replacement:
+
+        [HTML]
+