DEV: improve artifact editing and eval system (#1130)

- Add non-contiguous search/replace support using ... syntax - Add judge support for evaluating LLM outputs with ratings - Improve error handling and reporting in eval runner - Add full section replacement support without search blocks - Add fabricators and specs for artifact diffing - Track failed searches to improve debugging - Add JS syntax validation for artifact versions in eval system - Update prompt documentation with clear guidelines * improve eval output * move error handling * llm as a judge * fix spec * small note on evals
2025-06-29 02:52:16 +00:00 · 2025-02-19 15:44:33 +11:00 · 2025-02-19 15:44:33 +11:00 · 0c9466059c
commit 0c9466059c
parent 02f0908963
10 changed files with 591 additions and 27 deletions
--- a/README.md
+++ b/README.md
@ -3,3 +3,26 @@
 **Plugin Summary**
 For more information, please see: https://meta.discourse.org/t/discourse-ai/259214?u=falco
 ### Evals
 The directory `evals` contains AI evals for the Discourse AI plugin.
 To run them use: 
 cd evals
 ./run --help
 ```
 Usage: evals/run [options]
    -e, --eval NAME                  Name of the evaluation to run
        --list-models                List models
    -m, --model NAME                 Model to evaluate (will eval all models if not specified)
    -l, --list                       List evals
 ```
 To run evals you will need to configure API keys in your environment:
 OPENAI_API_KEY=your_openai_api_key
 ANTHROPIC_API_KEY=your_anthropic_api_key
 GEMINI_API_KEY=your_gemini_api_key
--- a/app/models/ai_artifact_version.rb
+++ b/app/models/ai_artifact_version.rb
@ -4,6 +4,19 @@ class AiArtifactVersion < ActiveRecord::Base
  validates :html, length: { maximum: 65_535 }
  validates :css, length: { maximum: 65_535 }
  validates :js, length: { maximum: 65_535 }
  # used when generating test cases
  def write_to(path)
    css_path = "#{path}/main.css"
    html_path = "#{path}/main.html"
    js_path = "#{path}/main.js"
    instructions_path = "#{path}/instructions.txt"
    File.write(css_path, css)
    File.write(html_path, html)
    File.write(js_path, js)
    File.write(instructions_path, change_description)
  end
 end
 # == Schema Information
--- a/evals/lib/eval.rb
+++ b/evals/lib/eval.rb
@ -10,7 +10,17 @@ class DiscourseAi::Evals::Eval
              :vision,
              :expected_output,
              :expected_output_regex,
-              :expected_tool_call
+              :expected_tool_call,
              :judge
  class EvalError < StandardError
    attr_reader :context
    def initialize(message, context)
      super(message)
      @context = context
    end
  end
  def initialize(path:)
    @yaml = YAML.load_file(path).symbolize_keys
@ -27,10 +37,14 @@ class DiscourseAi::Evals::Eval
      Regexp.new(@expected_output_regex, Regexp::MULTILINE) if @expected_output_regex
    @expected_tool_call = @yaml[:expected_tool_call]
    @expected_tool_call.symbolize_keys! if @expected_tool_call
    @judge = @yaml[:judge]
    @judge.symbolize_keys! if @judge
-    @args[:path] = File.expand_path(File.join(File.dirname(path), @args[:path])) if @args&.key?(
+    @args.each do |key, value|
-      :path,
+      if (key.to_s.include?("_path") || key.to_s == "path") && value.is_a?(String)
-    )
+        @args[key] = File.expand_path(File.join(File.dirname(path), value))
      end
    end
  end
  def run(llm:)
@ -44,6 +58,8 @@ class DiscourseAi::Evals::Eval
        image_to_text(llm, **args)
      when "prompt"
        prompt_call(llm, **args)
      when "edit_artifact"
        edit_artifact(llm, **args)
      end
    if expected_output
@ -53,7 +69,7 @@ class DiscourseAi::Evals::Eval
        { result: :fail, expected_output: expected_output, actual_output: result }
      end
    elsif expected_output_regex
-      if result.match?(expected_output_regex)
+      if result.to_s.match?(expected_output_regex)
        { result: :pass }
      else
        { result: :fail, expected_output: expected_output_regex, actual_output: result }
@ -71,9 +87,13 @@ class DiscourseAi::Evals::Eval
      else
        { result: :pass }
      end
    elsif judge
      judge_result(result)
    else
-      { result: :unknown, actual_output: result }
+      { result: :pass }
    end
  rescue EvalError => e
    { result: :fail, message: e.message, context: e.context }
  end
  def print
@ -96,14 +116,68 @@ class DiscourseAi::Evals::Eval
  private
-  def helper(llm, input:, name:)
+  def judge_result(result)
    prompt = judge[:prompt].dup
    prompt.sub!("{{output}}", result)
    prompt.sub!("{{input}}", args[:input])
    prompt += <<~SUFFIX
      Reply with a rating from 1 to 10, where 10 is perfect and 1 is terrible.
      example output:
      [RATING]10[/RATING] perfect output
      example output:
      [RATING]5[/RATING]
      the following failed to preserve... etc...
    SUFFIX
    judge_llm = DiscourseAi::Evals::Llm.choose(judge[:llm]).first
    DiscourseAi::Completions::Prompt.new(
      "You are an expert judge tasked at testing LLM outputs.",
      messages: [{ type: :user, content: prompt }],
    )
    result = judge_llm.llm_model.to_llm.generate(prompt, user: Discourse.system_user)
    if rating = result.match(%r{\[RATING\](\d+)\[/RATING\]})
      rating = rating[1].to_i
    end
    if rating.to_i >= judge[:pass_rating]
      { result: :pass }
    else
      {
        result: :fail,
        message: "LLM Rating below threshold, it was #{rating}, expecting #{judge[:pass_rating]}",
        context: result,
      }
    end
  end
  def helper(llm, input:, name:, locale: nil)
    completion_prompt = CompletionPrompt.find_by(name: name)
    helper = DiscourseAi::AiHelper::Assistant.new(helper_llm: llm.llm_proxy)
    user = Discourse.system_user
    if locale
      user = User.new
      class << user
        attr_accessor :effective_locale
      end
      user.effective_locale = locale
      user.admin = true
    end
    result =
      helper.generate_and_send_prompt(
        completion_prompt,
        input,
-        current_user = Discourse.system_user,
+        current_user = user,
        _force_default_locale = false,
      )
@ -169,4 +243,73 @@ class DiscourseAi::Evals::Eval
    end
    result
  end
  def edit_artifact(llm, css_path:, js_path:, html_path:, instructions_path:)
    css = File.read(css_path)
    js = File.read(js_path)
    html = File.read(html_path)
    instructions = File.read(instructions_path)
    artifact =
      AiArtifact.create!(
        css: css,
        js: js,
        html: html,
        user_id: Discourse.system_user.id,
        post_id: 1,
        name: "eval artifact",
      )
    post = Post.new(topic_id: 1, id: 1)
    diff =
      DiscourseAi::AiBot::ArtifactUpdateStrategies::Diff.new(
        llm: llm.llm_model.to_llm,
        post: post,
        user: Discourse.system_user,
        artifact: artifact,
        artifact_version: nil,
        instructions: instructions,
      )
    diff.apply
    if diff.failed_searches.present?
      puts "Eval Errors encountered"
      p diff.failed_searches
      raise EvalError.new("Failed to apply all changes", diff.failed_searches)
    end
    version = artifact.versions.last
    raise EvalError.new("Invalid JS", version.js) if !valid_javascript?(version.js)
    output = { css: version.css, js: version.js, html: version.html }
    artifact.destroy
    output
  end
  def valid_javascript?(str)
    require "open3"
    # Create a temporary file with the JavaScript code
    Tempfile.create(%w[test .js]) do |f|
      f.write(str)
      f.flush
      File.write("/tmp/test.js", str)
      begin
        Discourse::Utils.execute_command(
          "node",
          "--check",
          f.path,
          failure_message: "Invalid JavaScript syntax",
          timeout: 30, # reasonable timeout in seconds
        )
        true
      rescue Discourse::Utils::CommandError
        false
      end
    end
  rescue StandardError
    false
  end
 end
--- a/evals/lib/runner.rb
+++ b/evals/lib/runner.rb
@ -155,9 +155,18 @@ class DiscourseAi::Evals::Runner
          if result[:result] == :fail
            puts "Failed 🔴"
            puts "Error: #{result[:message]}" if result[:message]
            # this is deliberate, it creates a lot of noise, but sometimes for debugging it's useful
            #puts "Context: #{result[:context].to_s[0..2000]}" if result[:context]
            if result[:expected_output] && result[:actual_output]
              puts "---- Expected ----\n#{result[:expected_output]}"
              puts "---- Actual ----\n#{result[:actual_output]}"
            end
            logger.error("Evaluation failed with LLM: #{llm.name}")
            logger.error("Error: #{result[:message]}") if result[:message]
            logger.error("Expected: #{result[:expected_output]}") if result[:expected_output]
            logger.error("Actual: #{result[:actual_output]}") if result[:actual_output]
            logger.error("Context: #{result[:context]}") if result[:context]
          elsif result[:result] == :pass
            puts "Passed 🟢"
            logger.info("Evaluation passed with LLM: #{llm.name}")
--- a/lib/ai_bot/artifact_update_strategies/diff.rb
+++ b/lib/ai_bot/artifact_update_strategies/diff.rb
@ -3,8 +3,15 @@ module DiscourseAi
  module AiBot
    module ArtifactUpdateStrategies
      class Diff < Base
        attr_reader :failed_searches
        private
        def initialize(**kwargs)
          super
          @failed_searches = []
        end
        def build_prompt
          DiscourseAi::Completions::Prompt.new(
            system_prompt,
@ -51,15 +58,21 @@ module DiscourseAi
            content = source.public_send(section == :javascript ? :js : section)
            blocks.each do |block|
              begin
                if !block[:search]
                  content = block[:replace]
                else
                  content =
                    DiscourseAi::Utils::DiffUtils::SimpleDiff.apply(
                      content,
                      block[:search],
                      block[:replace],
                    )
                end
              rescue DiscourseAi::Utils::DiffUtils::SimpleDiff::NoMatchError
                @failed_searches << { section: section, search: block[:search] }
                # TODO, we may need to inform caller here, LLM made a mistake which it
                # should correct
                puts "Failed to find search: #{block[:search]}"
              end
            end
            updated_content[section == :javascript ? :js : section] = content
@ -76,7 +89,8 @@ module DiscourseAi
        private
        def extract_search_replace_blocks(content)
-          return nil if content.blank?
+          return nil if content.blank? || content.to_s.strip.downcase.match?(/^\(?no changes?\)?$/m)
          return [{ replace: content }] if !content.match?(/<<+\s*SEARCH/)
          blocks = []
          remaining = content
@ -98,29 +112,35 @@ module DiscourseAi
            1. Use EXACTLY this format for changes:
               <<<<<<< SEARCH
-               (exact code to find)
+               (first line of code to replace)
               (other lines of code to avoid ambiguity)
               (last line of code to replace)
               =======
               (replacement code)
               >>>>>>> REPLACE
            2. DO NOT modify the markers or add spaces around them
            3. DO NOT add explanations or comments within sections
            4. ONLY include [HTML], [CSS], and [JavaScript] sections if they have changes
-            5. Ensure search text matches EXACTLY - partial matches will fail
+            5. HTML should not include <html>, <head>, or <body> tags, it is injected into a template
-            6. Keep changes minimal and focused
+            6. When specifying a SEARCH block, ALWAYS keep it 8 lines or less, you will be interrupted and a retry will be required if you exceed this limit
-            7. HTML should not include <html>, <head>, or <body> tags, it is injected into a template
+            7. NEVER EVER ask followup questions, ALL changes must be performed in a single response, you are consumed via an API, there is no opportunity for humans in the loop
            8. When performing a non-contiguous search, ALWAYS use ... to denote the skipped lines
            9. Be mindful that ... non-contiguous search is not greedy, the following line will only match the first occurrence of the search block
            10. Never mix a full section replacement with a search/replace block in the same section
            11. ALWAYS skip sections you to not want to change, do not include them in the response
            JavaScript libraries must be sourced from the following CDNs, otherwise CSP will reject it:
            #{AiArtifact::ALLOWED_CDN_SOURCES.join("\n")}
            Reply Format:
            [HTML]
-            (changes or empty if no changes)
+            (changes or empty if no changes or entire HTML)
            [/HTML]
            [CSS]
-            (changes or empty if no changes)
+            (changes or empty if no changes or entire CSS)
            [/CSS]
            [JavaScript]
-            (changes or empty if no changes)
+            (changes or empty if no changes or entire JavaScript)
            [/JavaScript]
            Example - Multiple changes in one file:
@ -152,6 +172,68 @@ module DiscourseAi
            .text { font-size: 16px; }
            >>>>>>> REPLACE
            [/CSS]
            Example - Non contiguous search in CSS (replace most CSS with new CSS)
            Original CSS:
            [CSS]
            body {
              color: red;
            }
            .button {
              color: blue;
            }
            .alert {
              background-color: green;
            }
            .alert2 {
              background-color: green;
            }
            [/CSS]
            [CSS]
            <<<<<<< SEARCH
            body {
            ...
              background-color: green;
            }
            =======
            body {
              color: red;
            }
            >>>>>>> REPLACE
            RESULT:
            [CSS]
            body {
              color: red;
            }
            .alert2 {
              background-color: green;
            }
            [/CSS]
            Example - full HTML replacement:
            [HTML]
            <div>something old</div>
            <div>another somethin old</div>
            [/HTML]
            output:
            [HTML]
            <div>something new</div>
            [/HTML]
            result:
            [HTML]
            <div>something new</div>
            [/HTML]
          PROMPT
        end
--- a/lib/utils/diff_utils/simple_diff.rb
+++ b/lib/utils/diff_utils/simple_diff.rb
@ -26,6 +26,8 @@ module DiscourseAi
          lines = content.split("\n")
          search_lines = search.split("\n")
          ### TODO implement me
          # 1. Try exact matching
          match_positions =
            find_matches(lines, search_lines) { |line, search_line| line == search_line }
@ -38,7 +40,17 @@ module DiscourseAi
              end
          end
-          # 3. Try fuzzy matching
+          # 3. Try non-contiguous line based stripped matching
          if match_positions.empty?
            if range = non_contiguous_match_range(lines, search_lines)
              first_match, last_match = range
              lines.slice!(first_match, last_match - first_match + 1)
              lines.insert(first_match, *replace.split("\n"))
              return lines.join("\n")
            end
          end
          # 4. Try fuzzy matching
          if match_positions.empty?
            match_positions =
              find_matches(lines, search_lines) do |line, search_line|
@ -46,7 +58,7 @@ module DiscourseAi
              end
          end
-          # 4. Try block matching as last resort
+          # 5. Try block matching as last resort
          if match_positions.empty?
            if block_matches = find_block_matches(content, search)
              return replace_blocks(content, block_matches, replace)
@ -68,6 +80,27 @@ module DiscourseAi
        private
        def non_contiguous_match_range(lines, search_lines)
          first_idx = nil
          last_idx = nil
          search_index = 0
          lines.each_with_index do |line, idx|
            if search_lines[search_index].strip == "..."
              search_index += 1
              break if search_lines[search_index].nil?
            end
            if line.strip == search_lines[search_index].strip
              first_idx ||= idx
              last_idx = idx
              search_index += 1
              return first_idx, last_idx if search_index == search_lines.length
            end
          end
          nil
        end
        def find_matches(lines, search_lines)
          matches = []
          max_index = lines.length - search_lines.length
--- a/spec/fabricators/ai_artifact_fabricator.rb
+++ b/spec/fabricators/ai_artifact_fabricator.rb
@ -0,0 +1,19 @@
 # frozen_string_literal: true
 Fabricator(:ai_artifact) do
  user
  post
  name { sequence(:name) { |i| "artifact_#{i}" } }
  html { "<div>Test Content</div>" }
  css { ".test { color: blue; }" }
  js { "console.log('test');" }
  metadata { { public: false } }
 end
 Fabricator(:ai_artifact_version) do
  ai_artifact
  version_number { sequence(:version_number) { |i| i } }
  html { "<div>Version Content</div>" }
  css { ".version { color: red; }" }
  js { "console.log('version');" }
  change_description { "Test change" }
 end
--- a/spec/lib/modules/ai_bot/artifact_update_strategies/diff_spec.rb
+++ b/spec/lib/modules/ai_bot/artifact_update_strategies/diff_spec.rb
@ -0,0 +1,214 @@
 # frozen_string_literal: true
 RSpec.describe DiscourseAi::AiBot::ArtifactUpdateStrategies::Diff do
  fab!(:user)
  fab!(:post)
  fab!(:artifact) { Fabricate(:ai_artifact) }
  fab!(:llm_model)
  let(:llm) { llm_model.to_llm }
  let(:instructions) { "Update the button color to red" }
  let(:strategy) do
    described_class.new(
      llm: llm,
      post: post,
      user: user,
      artifact: artifact,
      artifact_version: nil,
      instructions: instructions,
    )
  end
  describe "#apply" do
    it "processes simple search/replace blocks" do
      original_css = ".button { color: blue; }"
      artifact.update!(css: original_css)
      response = <<~RESPONSE
        [CSS]
        <<<<<<< SEARCH
        .button { color: blue; }
        =======
        .button { color: red; }
        >>>>>>> REPLACE
        [/CSS]
      RESPONSE
      DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
      expect(artifact.versions.last.css).to eq(".button { color: red; }")
    end
    it "handles multiple search/replace blocks in the same section" do
      original_css = <<~CSS
        .button { color: blue; }
        .text { font-size: 12px; }
      CSS
      artifact.update!(css: original_css)
      response = <<~RESPONSE
        [CSS]
        <<<<<<< SEARCH
        .button { color: blue; }
        =======
        .button { color: red; }
        >>>>>>> REPLACE
        <<<<<<< SEARCH
        .text { font-size: 12px; }
        =======
        .text { font-size: 16px; }
        >>>>>>> REPLACE
        [/CSS]
      RESPONSE
      DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
      expected = <<~CSS.strip
        .button { color: red; }
        .text { font-size: 16px; }
      CSS
      expect(artifact.versions.last.css.strip).to eq(expected.strip)
    end
    it "handles non-contiguous search/replace using ..." do
      original_css = <<~CSS
        body {
          color: red;
        }
        .button {
          color: blue;
        }
        .alert {
          background-color: green;
        }
      CSS
      artifact.update!(css: original_css)
      response = <<~RESPONSE
        [CSS]
        <<<<<<< SEARCH
        body {
        ...
        background-color: green;
        }
        =======
        body {
          color: red;
        }
        >>>>>>> REPLACE
        [/CSS]
      RESPONSE
      DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
      expect(artifact.versions.last.css).to eq("body {\n  color: red;\n}")
    end
    it "tracks failed searches" do
      original_css = ".button { color: blue; }"
      artifact.update!(css: original_css)
      response = <<~RESPONSE
        [CSS]
        <<<<<<< SEARCH
        .button { color: green; }
        =======
        .button { color: red; }
        >>>>>>> REPLACE
        [/CSS]
      RESPONSE
      DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
      expect(strategy.failed_searches).to contain_exactly(
        { section: :css, search: ".button { color: green; }" },
      )
      expect(artifact.versions.last.css).to eq(original_css)
    end
    it "handles complete section replacements" do
      original_html = "<div>old content</div>"
      artifact.update!(html: original_html)
      response = <<~RESPONSE
        [HTML]
        <div>new content</div>
        [/HTML]
      RESPONSE
      DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
      expect(artifact.versions.last.html.strip).to eq("<div>new content</div>")
    end
    it "ignores empty or 'no changes' sections part 1" do
      original = {
        html: "<div>content</div>",
        css: ".button { color: blue; }",
        js: "console.log('test');",
      }
      artifact.update!(html: original[:html], css: original[:css], js: original[:js])
      response = <<~RESPONSE
        [HTML]
        no changes
        [/HTML]
        [CSS]
        (NO CHANGES)
        [/CSS]
        [JavaScript]
        <<<<<<< SEARCH
        console.log('test');
        =======
        console.log('(no changes)');
        >>>>>>> REPLACE
        [/JavaScript]
      RESPONSE
      DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
      version = artifact.versions.last
      expect(version.html).to eq(original[:html])
      expect(version.css).to eq(original[:css])
      expect(version.js).to eq("console.log('(no changes)');")
    end
    it "ignores empty or 'no changes' section part 2" do
      original = {
        html: "<div>content</div>",
        css: ".button { color: blue; }",
        js: "console.log('test');",
      }
      artifact.update!(html: original[:html], css: original[:css], js: original[:js])
      response = <<~RESPONSE
        [HTML]
        (no changes)
        [/HTML]
        [CSS]
        [/CSS]
        [JavaScript]
        <<<<<<< SEARCH
        console.log('test');
        =======
        console.log('updated');
        >>>>>>> REPLACE
        [/JavaScript]
      RESPONSE
      DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
      version = artifact.versions.last
      expect(version.html).to eq(original[:html])
      expect(version.css).to eq(original[:css])
      expect(version.js).to eq("console.log('updated');")
    end
  end
 end
--- a/spec/lib/modules/ai_bot/tools/update_artifact_spec.rb
+++ b/spec/lib/modules/ai_bot/tools/update_artifact_spec.rb
@ -262,7 +262,7 @@ RSpec.describe DiscourseAi::AiBot::Tools::UpdateArtifact do
    [/CSS]
    [JavaScript]
-    nothing to do
+    no changes
    [/JavaScript]
    LLMs like to say nonsense that we can ignore here as well
--- a/spec/lib/utils/diff_utils/simple_diff_spec.rb
+++ b/spec/lib/utils/diff_utils/simple_diff_spec.rb
@ -171,5 +171,33 @@ RSpec.describe DiscourseAi::Utils::DiffUtils::SimpleDiff do
      expect(subject.apply(content, search, replace).strip).to eq(expected.strip)
    end
    it "handles missing lines in search" do
      original = <<~TEXT
        line1
         line2
        line3
        line4
        line5
        line1
        line2
      TEXT
      search = <<~TEXT
        line1
        ...
         line3
        ...
        line1
      TEXT
      replace = ""
      expected = <<~TEXT
        line2
      TEXT
      expect(subject.apply(original, search, replace).strip).to eq(expected.strip)
    end
  end
 end