diff --git a/README.md b/README.md index cd7ae38a..6bbc5a8b 100644 --- a/README.md +++ b/README.md @@ -3,3 +3,26 @@ **Plugin Summary** For more information, please see: https://meta.discourse.org/t/discourse-ai/259214?u=falco + +### Evals + +The directory `evals` contains AI evals for the Discourse AI plugin. + +To run them use: + +cd evals +./run --help + +``` +Usage: evals/run [options] + -e, --eval NAME Name of the evaluation to run + --list-models List models + -m, --model NAME Model to evaluate (will eval all models if not specified) + -l, --list List evals +``` + +To run evals you will need to configure API keys in your environment: + +OPENAI_API_KEY=your_openai_api_key +ANTHROPIC_API_KEY=your_anthropic_api_key +GEMINI_API_KEY=your_gemini_api_key diff --git a/app/models/ai_artifact_version.rb b/app/models/ai_artifact_version.rb index 94589a8f..9a7d2b14 100644 --- a/app/models/ai_artifact_version.rb +++ b/app/models/ai_artifact_version.rb @@ -4,6 +4,19 @@ class AiArtifactVersion < ActiveRecord::Base validates :html, length: { maximum: 65_535 } validates :css, length: { maximum: 65_535 } validates :js, length: { maximum: 65_535 } + + # used when generating test cases + def write_to(path) + css_path = "#{path}/main.css" + html_path = "#{path}/main.html" + js_path = "#{path}/main.js" + instructions_path = "#{path}/instructions.txt" + + File.write(css_path, css) + File.write(html_path, html) + File.write(js_path, js) + File.write(instructions_path, change_description) + end end # == Schema Information diff --git a/evals/lib/eval.rb b/evals/lib/eval.rb index 05b90a15..280ab871 100644 --- a/evals/lib/eval.rb +++ b/evals/lib/eval.rb @@ -10,7 +10,17 @@ class DiscourseAi::Evals::Eval :vision, :expected_output, :expected_output_regex, - :expected_tool_call + :expected_tool_call, + :judge + + class EvalError < StandardError + attr_reader :context + + def initialize(message, context) + super(message) + @context = context + end + end def initialize(path:) @yaml = YAML.load_file(path).symbolize_keys @@ -27,10 +37,14 @@ class DiscourseAi::Evals::Eval Regexp.new(@expected_output_regex, Regexp::MULTILINE) if @expected_output_regex @expected_tool_call = @yaml[:expected_tool_call] @expected_tool_call.symbolize_keys! if @expected_tool_call + @judge = @yaml[:judge] + @judge.symbolize_keys! if @judge - @args[:path] = File.expand_path(File.join(File.dirname(path), @args[:path])) if @args&.key?( - :path, - ) + @args.each do |key, value| + if (key.to_s.include?("_path") || key.to_s == "path") && value.is_a?(String) + @args[key] = File.expand_path(File.join(File.dirname(path), value)) + end + end end def run(llm:) @@ -44,6 +58,8 @@ class DiscourseAi::Evals::Eval image_to_text(llm, **args) when "prompt" prompt_call(llm, **args) + when "edit_artifact" + edit_artifact(llm, **args) end if expected_output @@ -53,7 +69,7 @@ class DiscourseAi::Evals::Eval { result: :fail, expected_output: expected_output, actual_output: result } end elsif expected_output_regex - if result.match?(expected_output_regex) + if result.to_s.match?(expected_output_regex) { result: :pass } else { result: :fail, expected_output: expected_output_regex, actual_output: result } @@ -71,9 +87,13 @@ class DiscourseAi::Evals::Eval else { result: :pass } end + elsif judge + judge_result(result) else - { result: :unknown, actual_output: result } + { result: :pass } end + rescue EvalError => e + { result: :fail, message: e.message, context: e.context } end def print @@ -96,14 +116,68 @@ class DiscourseAi::Evals::Eval private - def helper(llm, input:, name:) + def judge_result(result) + prompt = judge[:prompt].dup + prompt.sub!("{{output}}", result) + prompt.sub!("{{input}}", args[:input]) + + prompt += <<~SUFFIX + + Reply with a rating from 1 to 10, where 10 is perfect and 1 is terrible. + + example output: + + [RATING]10[/RATING] perfect output + + example output: + + [RATING]5[/RATING] + + the following failed to preserve... etc... + SUFFIX + + judge_llm = DiscourseAi::Evals::Llm.choose(judge[:llm]).first + + DiscourseAi::Completions::Prompt.new( + "You are an expert judge tasked at testing LLM outputs.", + messages: [{ type: :user, content: prompt }], + ) + + result = judge_llm.llm_model.to_llm.generate(prompt, user: Discourse.system_user) + + if rating = result.match(%r{\[RATING\](\d+)\[/RATING\]}) + rating = rating[1].to_i + end + + if rating.to_i >= judge[:pass_rating] + { result: :pass } + else + { + result: :fail, + message: "LLM Rating below threshold, it was #{rating}, expecting #{judge[:pass_rating]}", + context: result, + } + end + end + + def helper(llm, input:, name:, locale: nil) completion_prompt = CompletionPrompt.find_by(name: name) helper = DiscourseAi::AiHelper::Assistant.new(helper_llm: llm.llm_proxy) + user = Discourse.system_user + if locale + user = User.new + class << user + attr_accessor :effective_locale + end + + user.effective_locale = locale + user.admin = true + end result = helper.generate_and_send_prompt( completion_prompt, input, - current_user = Discourse.system_user, + current_user = user, _force_default_locale = false, ) @@ -169,4 +243,73 @@ class DiscourseAi::Evals::Eval end result end + + def edit_artifact(llm, css_path:, js_path:, html_path:, instructions_path:) + css = File.read(css_path) + js = File.read(js_path) + html = File.read(html_path) + instructions = File.read(instructions_path) + artifact = + AiArtifact.create!( + css: css, + js: js, + html: html, + user_id: Discourse.system_user.id, + post_id: 1, + name: "eval artifact", + ) + + post = Post.new(topic_id: 1, id: 1) + diff = + DiscourseAi::AiBot::ArtifactUpdateStrategies::Diff.new( + llm: llm.llm_model.to_llm, + post: post, + user: Discourse.system_user, + artifact: artifact, + artifact_version: nil, + instructions: instructions, + ) + diff.apply + + if diff.failed_searches.present? + puts "Eval Errors encountered" + p diff.failed_searches + raise EvalError.new("Failed to apply all changes", diff.failed_searches) + end + + version = artifact.versions.last + raise EvalError.new("Invalid JS", version.js) if !valid_javascript?(version.js) + + output = { css: version.css, js: version.js, html: version.html } + + artifact.destroy + output + end + + def valid_javascript?(str) + require "open3" + + # Create a temporary file with the JavaScript code + Tempfile.create(%w[test .js]) do |f| + f.write(str) + f.flush + + File.write("/tmp/test.js", str) + + begin + Discourse::Utils.execute_command( + "node", + "--check", + f.path, + failure_message: "Invalid JavaScript syntax", + timeout: 30, # reasonable timeout in seconds + ) + true + rescue Discourse::Utils::CommandError + false + end + end + rescue StandardError + false + end end diff --git a/evals/lib/runner.rb b/evals/lib/runner.rb index 87cc153e..86fa34b2 100644 --- a/evals/lib/runner.rb +++ b/evals/lib/runner.rb @@ -155,9 +155,18 @@ class DiscourseAi::Evals::Runner if result[:result] == :fail puts "Failed 🔴" - puts "---- Expected ----\n#{result[:expected_output]}" - puts "---- Actual ----\n#{result[:actual_output]}" + puts "Error: #{result[:message]}" if result[:message] + # this is deliberate, it creates a lot of noise, but sometimes for debugging it's useful + #puts "Context: #{result[:context].to_s[0..2000]}" if result[:context] + if result[:expected_output] && result[:actual_output] + puts "---- Expected ----\n#{result[:expected_output]}" + puts "---- Actual ----\n#{result[:actual_output]}" + end logger.error("Evaluation failed with LLM: #{llm.name}") + logger.error("Error: #{result[:message]}") if result[:message] + logger.error("Expected: #{result[:expected_output]}") if result[:expected_output] + logger.error("Actual: #{result[:actual_output]}") if result[:actual_output] + logger.error("Context: #{result[:context]}") if result[:context] elsif result[:result] == :pass puts "Passed 🟢" logger.info("Evaluation passed with LLM: #{llm.name}") diff --git a/lib/ai_bot/artifact_update_strategies/diff.rb b/lib/ai_bot/artifact_update_strategies/diff.rb index fc2c2345..4a0ff28d 100644 --- a/lib/ai_bot/artifact_update_strategies/diff.rb +++ b/lib/ai_bot/artifact_update_strategies/diff.rb @@ -3,8 +3,15 @@ module DiscourseAi module AiBot module ArtifactUpdateStrategies class Diff < Base + attr_reader :failed_searches + private + def initialize(**kwargs) + super + @failed_searches = [] + end + def build_prompt DiscourseAi::Completions::Prompt.new( system_prompt, @@ -51,15 +58,21 @@ module DiscourseAi content = source.public_send(section == :javascript ? :js : section) blocks.each do |block| begin - content = - DiscourseAi::Utils::DiffUtils::SimpleDiff.apply( - content, - block[:search], - block[:replace], - ) + if !block[:search] + content = block[:replace] + else + content = + DiscourseAi::Utils::DiffUtils::SimpleDiff.apply( + content, + block[:search], + block[:replace], + ) + end rescue DiscourseAi::Utils::DiffUtils::SimpleDiff::NoMatchError + @failed_searches << { section: section, search: block[:search] } # TODO, we may need to inform caller here, LLM made a mistake which it # should correct + puts "Failed to find search: #{block[:search]}" end end updated_content[section == :javascript ? :js : section] = content @@ -76,7 +89,8 @@ module DiscourseAi private def extract_search_replace_blocks(content) - return nil if content.blank? + return nil if content.blank? || content.to_s.strip.downcase.match?(/^\(?no changes?\)?$/m) + return [{ replace: content }] if !content.match?(/<<+\s*SEARCH/) blocks = [] remaining = content @@ -98,29 +112,35 @@ module DiscourseAi 1. Use EXACTLY this format for changes: <<<<<<< SEARCH - (exact code to find) + (first line of code to replace) + (other lines of code to avoid ambiguity) + (last line of code to replace) ======= (replacement code) >>>>>>> REPLACE 2. DO NOT modify the markers or add spaces around them 3. DO NOT add explanations or comments within sections 4. ONLY include [HTML], [CSS], and [JavaScript] sections if they have changes - 5. Ensure search text matches EXACTLY - partial matches will fail - 6. Keep changes minimal and focused - 7. HTML should not include , , or tags, it is injected into a template + 5. HTML should not include , , or tags, it is injected into a template + 6. When specifying a SEARCH block, ALWAYS keep it 8 lines or less, you will be interrupted and a retry will be required if you exceed this limit + 7. NEVER EVER ask followup questions, ALL changes must be performed in a single response, you are consumed via an API, there is no opportunity for humans in the loop + 8. When performing a non-contiguous search, ALWAYS use ... to denote the skipped lines + 9. Be mindful that ... non-contiguous search is not greedy, the following line will only match the first occurrence of the search block + 10. Never mix a full section replacement with a search/replace block in the same section + 11. ALWAYS skip sections you to not want to change, do not include them in the response JavaScript libraries must be sourced from the following CDNs, otherwise CSP will reject it: #{AiArtifact::ALLOWED_CDN_SOURCES.join("\n")} Reply Format: [HTML] - (changes or empty if no changes) + (changes or empty if no changes or entire HTML) [/HTML] [CSS] - (changes or empty if no changes) + (changes or empty if no changes or entire CSS) [/CSS] [JavaScript] - (changes or empty if no changes) + (changes or empty if no changes or entire JavaScript) [/JavaScript] Example - Multiple changes in one file: @@ -152,6 +172,68 @@ module DiscourseAi .text { font-size: 16px; } >>>>>>> REPLACE [/CSS] + + Example - Non contiguous search in CSS (replace most CSS with new CSS) + + Original CSS: + + [CSS] + body { + color: red; + } + .button { + color: blue; + } + .alert { + background-color: green; + } + .alert2 { + background-color: green; + } + [/CSS] + + [CSS] + <<<<<<< SEARCH + body { + ... + background-color: green; + } + ======= + body { + color: red; + } + >>>>>>> REPLACE + + RESULT: + + [CSS] + body { + color: red; + } + .alert2 { + background-color: green; + } + [/CSS] + + Example - full HTML replacement: + + [HTML] +
something old
+
another somethin old
+ [/HTML] + + output: + + [HTML] +
something new
+ [/HTML] + + result: + [HTML] +
something new
+ [/HTML] + + PROMPT end diff --git a/lib/utils/diff_utils/simple_diff.rb b/lib/utils/diff_utils/simple_diff.rb index e03dfef3..202ea881 100644 --- a/lib/utils/diff_utils/simple_diff.rb +++ b/lib/utils/diff_utils/simple_diff.rb @@ -26,6 +26,8 @@ module DiscourseAi lines = content.split("\n") search_lines = search.split("\n") + ### TODO implement me + # 1. Try exact matching match_positions = find_matches(lines, search_lines) { |line, search_line| line == search_line } @@ -38,7 +40,17 @@ module DiscourseAi end end - # 3. Try fuzzy matching + # 3. Try non-contiguous line based stripped matching + if match_positions.empty? + if range = non_contiguous_match_range(lines, search_lines) + first_match, last_match = range + lines.slice!(first_match, last_match - first_match + 1) + lines.insert(first_match, *replace.split("\n")) + return lines.join("\n") + end + end + + # 4. Try fuzzy matching if match_positions.empty? match_positions = find_matches(lines, search_lines) do |line, search_line| @@ -46,7 +58,7 @@ module DiscourseAi end end - # 4. Try block matching as last resort + # 5. Try block matching as last resort if match_positions.empty? if block_matches = find_block_matches(content, search) return replace_blocks(content, block_matches, replace) @@ -68,6 +80,27 @@ module DiscourseAi private + def non_contiguous_match_range(lines, search_lines) + first_idx = nil + last_idx = nil + search_index = 0 + + lines.each_with_index do |line, idx| + if search_lines[search_index].strip == "..." + search_index += 1 + break if search_lines[search_index].nil? + end + if line.strip == search_lines[search_index].strip + first_idx ||= idx + last_idx = idx + search_index += 1 + return first_idx, last_idx if search_index == search_lines.length + end + end + + nil + end + def find_matches(lines, search_lines) matches = [] max_index = lines.length - search_lines.length diff --git a/spec/fabricators/ai_artifact_fabricator.rb b/spec/fabricators/ai_artifact_fabricator.rb new file mode 100644 index 00000000..49ce9ee2 --- /dev/null +++ b/spec/fabricators/ai_artifact_fabricator.rb @@ -0,0 +1,19 @@ +# frozen_string_literal: true +Fabricator(:ai_artifact) do + user + post + name { sequence(:name) { |i| "artifact_#{i}" } } + html { "
Test Content
" } + css { ".test { color: blue; }" } + js { "console.log('test');" } + metadata { { public: false } } +end + +Fabricator(:ai_artifact_version) do + ai_artifact + version_number { sequence(:version_number) { |i| i } } + html { "
Version Content
" } + css { ".version { color: red; }" } + js { "console.log('version');" } + change_description { "Test change" } +end diff --git a/spec/lib/modules/ai_bot/artifact_update_strategies/diff_spec.rb b/spec/lib/modules/ai_bot/artifact_update_strategies/diff_spec.rb new file mode 100644 index 00000000..c2426ad4 --- /dev/null +++ b/spec/lib/modules/ai_bot/artifact_update_strategies/diff_spec.rb @@ -0,0 +1,214 @@ +# frozen_string_literal: true + +RSpec.describe DiscourseAi::AiBot::ArtifactUpdateStrategies::Diff do + fab!(:user) + fab!(:post) + fab!(:artifact) { Fabricate(:ai_artifact) } + fab!(:llm_model) + + let(:llm) { llm_model.to_llm } + let(:instructions) { "Update the button color to red" } + + let(:strategy) do + described_class.new( + llm: llm, + post: post, + user: user, + artifact: artifact, + artifact_version: nil, + instructions: instructions, + ) + end + + describe "#apply" do + it "processes simple search/replace blocks" do + original_css = ".button { color: blue; }" + artifact.update!(css: original_css) + + response = <<~RESPONSE + [CSS] + <<<<<<< SEARCH + .button { color: blue; } + ======= + .button { color: red; } + >>>>>>> REPLACE + [/CSS] + RESPONSE + + DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply } + + expect(artifact.versions.last.css).to eq(".button { color: red; }") + end + + it "handles multiple search/replace blocks in the same section" do + original_css = <<~CSS + .button { color: blue; } + .text { font-size: 12px; } + CSS + + artifact.update!(css: original_css) + + response = <<~RESPONSE + [CSS] + <<<<<<< SEARCH + .button { color: blue; } + ======= + .button { color: red; } + >>>>>>> REPLACE + <<<<<<< SEARCH + .text { font-size: 12px; } + ======= + .text { font-size: 16px; } + >>>>>>> REPLACE + [/CSS] + RESPONSE + + DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply } + + expected = <<~CSS.strip + .button { color: red; } + .text { font-size: 16px; } + CSS + + expect(artifact.versions.last.css.strip).to eq(expected.strip) + end + + it "handles non-contiguous search/replace using ..." do + original_css = <<~CSS + body { + color: red; + } + .button { + color: blue; + } + .alert { + background-color: green; + } + CSS + + artifact.update!(css: original_css) + + response = <<~RESPONSE + [CSS] + <<<<<<< SEARCH + body { + ... + background-color: green; + } + ======= + body { + color: red; + } + >>>>>>> REPLACE + [/CSS] + RESPONSE + + DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply } + + expect(artifact.versions.last.css).to eq("body {\n color: red;\n}") + end + + it "tracks failed searches" do + original_css = ".button { color: blue; }" + artifact.update!(css: original_css) + + response = <<~RESPONSE + [CSS] + <<<<<<< SEARCH + .button { color: green; } + ======= + .button { color: red; } + >>>>>>> REPLACE + [/CSS] + RESPONSE + + DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply } + + expect(strategy.failed_searches).to contain_exactly( + { section: :css, search: ".button { color: green; }" }, + ) + expect(artifact.versions.last.css).to eq(original_css) + end + + it "handles complete section replacements" do + original_html = "
old content
" + artifact.update!(html: original_html) + + response = <<~RESPONSE + [HTML] +
new content
+ [/HTML] + RESPONSE + + DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply } + + expect(artifact.versions.last.html.strip).to eq("
new content
") + end + + it "ignores empty or 'no changes' sections part 1" do + original = { + html: "
content
", + css: ".button { color: blue; }", + js: "console.log('test');", + } + + artifact.update!(html: original[:html], css: original[:css], js: original[:js]) + + response = <<~RESPONSE + [HTML] + no changes + [/HTML] + [CSS] + (NO CHANGES) + [/CSS] + [JavaScript] + <<<<<<< SEARCH + console.log('test'); + ======= + console.log('(no changes)'); + >>>>>>> REPLACE + [/JavaScript] + RESPONSE + + DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply } + + version = artifact.versions.last + expect(version.html).to eq(original[:html]) + expect(version.css).to eq(original[:css]) + expect(version.js).to eq("console.log('(no changes)');") + end + + it "ignores empty or 'no changes' section part 2" do + original = { + html: "
content
", + css: ".button { color: blue; }", + js: "console.log('test');", + } + + artifact.update!(html: original[:html], css: original[:css], js: original[:js]) + + response = <<~RESPONSE + [HTML] + (no changes) + [/HTML] + [CSS] + + [/CSS] + [JavaScript] + <<<<<<< SEARCH + console.log('test'); + ======= + console.log('updated'); + >>>>>>> REPLACE + [/JavaScript] + RESPONSE + + DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply } + + version = artifact.versions.last + expect(version.html).to eq(original[:html]) + expect(version.css).to eq(original[:css]) + expect(version.js).to eq("console.log('updated');") + end + end +end diff --git a/spec/lib/modules/ai_bot/tools/update_artifact_spec.rb b/spec/lib/modules/ai_bot/tools/update_artifact_spec.rb index 17833a0b..c9834d25 100644 --- a/spec/lib/modules/ai_bot/tools/update_artifact_spec.rb +++ b/spec/lib/modules/ai_bot/tools/update_artifact_spec.rb @@ -262,7 +262,7 @@ RSpec.describe DiscourseAi::AiBot::Tools::UpdateArtifact do [/CSS] [JavaScript] - nothing to do + no changes [/JavaScript] LLMs like to say nonsense that we can ignore here as well diff --git a/spec/lib/utils/diff_utils/simple_diff_spec.rb b/spec/lib/utils/diff_utils/simple_diff_spec.rb index 8a29028f..fe54e20b 100644 --- a/spec/lib/utils/diff_utils/simple_diff_spec.rb +++ b/spec/lib/utils/diff_utils/simple_diff_spec.rb @@ -171,5 +171,33 @@ RSpec.describe DiscourseAi::Utils::DiffUtils::SimpleDiff do expect(subject.apply(content, search, replace).strip).to eq(expected.strip) end + + it "handles missing lines in search" do + original = <<~TEXT + line1 + line2 + line3 + line4 + line5 + line1 + line2 + TEXT + + search = <<~TEXT + line1 + ... + line3 + ... + line1 + TEXT + + replace = "" + + expected = <<~TEXT + line2 + TEXT + + expect(subject.apply(original, search, replace).strip).to eq(expected.strip) + end end end