DEV: improve artifact editing and eval system (#1130)

- Add non-contiguous search/replace support using ... syntax - Add judge support for evaluating LLM outputs with ratings - Improve error handling and reporting in eval runner - Add full section replacement support without search blocks - Add fabricators and specs for artifact diffing - Track failed searches to improve debugging - Add JS syntax validation for artifact versions in eval system - Update prompt documentation with clear guidelines * improve eval output * move error handling * llm as a judge * fix spec * small note on evals
2025-06-25 17:12:16 +00:00 · 2025-02-19 15:44:33 +11:00 · 2025-02-19 15:44:33 +11:00 · 0c9466059c
commit 0c9466059c
parent 02f0908963
10 changed files with 591 additions and 27 deletions
--- a/README.md
+++ b/README.md
@ -3,3 +3,26 @@
 **Plugin Summary**

 For more information, please see: https://meta.discourse.org/t/discourse-ai/259214?u=falco
+
+### Evals
+
+The directory `evals` contains AI evals for the Discourse AI plugin.
+
+To run them use:
+
+```
+cd evals
+./run --help
+```
+
+```
+Usage: evals/run [options]
+    -e, --eval NAME                  Name of the evaluation to run
+        --list-models                List models
+    -m, --model NAME                 Model to evaluate (will eval all models if not specified)
+    -l, --list                       List evals
+```
+
+To run evals you will need to configure API keys in your environment:
+
+```
+OPENAI_API_KEY=your_openai_api_key
+ANTHROPIC_API_KEY=your_anthropic_api_key
+GEMINI_API_KEY=your_gemini_api_key
+```
--- a/app/models/ai_artifact_version.rb
+++ b/app/models/ai_artifact_version.rb
@ -4,6 +4,19 @@ class AiArtifactVersion < ActiveRecord::Base
  validates :html, length: { maximum: 65_535 }
  validates :css, length: { maximum: 65_535 }
  validates :js, length: { maximum: 65_535 }
+
+  # Used when generating test cases: dumps this version into the directory
+  # +path+ as main.css, main.html, main.js and instructions.txt (the latter
+  # holds the change description).
+  def write_to(path)
+    {
+      "main.css" => css,
+      "main.html" => html,
+      "main.js" => js,
+      "instructions.txt" => change_description,
+    }.each { |file_name, contents| File.write("#{path}/#{file_name}", contents) }
+  end
 end

 # == Schema Information
--- a/evals/lib/eval.rb
+++ b/evals/lib/eval.rb
@ -10,7 +10,17 @@ class DiscourseAi::Evals::Eval
              :vision,
              :expected_output,
              :expected_output_regex,
-              :expected_tool_call
+              :expected_tool_call,
+              :judge
+
+  # Raised by an eval step to signal a failed evaluation. #run rescues it and
+  # reports the message and context in the :fail result hash.
+  class EvalError < StandardError
+    # Extra debugging payload supplied by the raiser (e.g. the failed searches
+    # or the JS that did not validate); nil when none was given.
+    attr_reader :context
+
+    # message - human readable failure reason
+    # context - optional debugging payload; defaults to nil so that the
+    #           idiomatic `raise EvalError, "message"` form also works
+    def initialize(message, context = nil)
+      super(message)
+      @context = context
+    end
+  end

  def initialize(path:)
    @yaml = YAML.load_file(path).symbolize_keys
@ -27,10 +37,14 @@ class DiscourseAi::Evals::Eval
      Regexp.new(@expected_output_regex, Regexp::MULTILINE) if @expected_output_regex
    @expected_tool_call = @yaml[:expected_tool_call]
    @expected_tool_call.symbolize_keys! if @expected_tool_call
+    @judge = @yaml[:judge]
+    @judge.symbolize_keys! if @judge

-    @args[:path] = File.expand_path(File.join(File.dirname(path), @args[:path])) if @args&.key?(
-      :path,
-    )
+    @args.each do |key, value|
+      if (key.to_s.include?("_path") || key.to_s == "path") && value.is_a?(String)
+        @args[key] = File.expand_path(File.join(File.dirname(path), value))
+      end
+    end
  end

  def run(llm:)
@ -44,6 +58,8 @@ class DiscourseAi::Evals::Eval
        image_to_text(llm, **args)
      when "prompt"
        prompt_call(llm, **args)
+      when "edit_artifact"
+        edit_artifact(llm, **args)
      end

    if expected_output
@ -53,7 +69,7 @@ class DiscourseAi::Evals::Eval
        { result: :fail, expected_output: expected_output, actual_output: result }
      end
    elsif expected_output_regex
-      if result.match?(expected_output_regex)
+      if result.to_s.match?(expected_output_regex)
        { result: :pass }
      else
        { result: :fail, expected_output: expected_output_regex, actual_output: result }
@ -71,9 +87,13 @@ class DiscourseAi::Evals::Eval
      else
        { result: :pass }
      end
+    elsif judge
+      judge_result(result)
    else
-      { result: :unknown, actual_output: result }
+      { result: :pass }
    end
+  rescue EvalError => e
+    { result: :fail, message: e.message, context: e.context }
  end

  def print
@ -96,14 +116,68 @@ class DiscourseAi::Evals::Eval

  private

-  def helper(llm, input:, name:)
+  # Asks the LLM configured under the eval's `judge` key to rate +result+.
+  #
+  # `{{output}}` and `{{input}}` in judge[:prompt] are replaced (first
+  # occurrence only) with the eval output and args[:input]. The judge must
+  # answer with `[RATING]n[/RATING]`; a reply without a rating counts as 0.
+  #
+  # Returns { result: :pass } when the rating is >= judge[:pass_rating],
+  # otherwise a :fail hash carrying the judge's full reply as :context.
+  def judge_result(result)
+    # edit_artifact returns a Hash of sections; render it as readable text
+    # instead of letting sub! treat the Hash as a replacement lookup table.
+    output =
+      if result.is_a?(Hash)
+        result.map { |section, content| "#{section}:\n#{content}" }.join("\n\n")
+      else
+        result.to_s
+      end
+
+    prompt = judge[:prompt].dup
+    # Block form inserts the text verbatim; the two-argument form would
+    # interpret backslash sequences (\1, \\) that may appear in LLM output.
+    prompt.sub!("{{output}}") { output }
+    # args[:input] is absent for evals that only take file paths
+    prompt.sub!("{{input}}") { args[:input].to_s }
+
+    prompt += <<~SUFFIX
+
+      Reply with a rating from 1 to 10, where 10 is perfect and 1 is terrible.
+
+      example output:
+
+      [RATING]10[/RATING] perfect output
+
+      example output:
+
+      [RATING]5[/RATING]
+
+      the following failed to preserve... etc...
+    SUFFIX
+
+    judge_llm = DiscourseAi::Evals::Llm.choose(judge[:llm]).first
+
+    # Hand the Prompt object (not the bare string) to the LLM so the judge
+    # system message is actually sent.
+    judge_prompt =
+      DiscourseAi::Completions::Prompt.new(
+        "You are an expert judge tasked at testing LLM outputs.",
+        messages: [{ type: :user, content: prompt }],
+      )
+
+    verdict = judge_llm.llm_model.to_llm.generate(judge_prompt, user: Discourse.system_user)
+
+    # String#[] with a capture index yields nil when there is no match,
+    # and nil.to_i is 0, so an unparsable reply fails the threshold check.
+    rating = verdict.to_s[%r{\[RATING\](\d+)\[/RATING\]}, 1].to_i
+
+    if rating >= judge[:pass_rating]
+      { result: :pass }
+    else
+      {
+        result: :fail,
+        message: "LLM Rating below threshold, it was #{rating}, expecting #{judge[:pass_rating]}",
+        context: verdict,
+      }
+    end
+  end
+
+  def helper(llm, input:, name:, locale: nil)
    completion_prompt = CompletionPrompt.find_by(name: name)
    helper = DiscourseAi::AiHelper::Assistant.new(helper_llm: llm.llm_proxy)
+    user = Discourse.system_user
+    if locale
+      user = User.new
+      class << user
+        attr_accessor :effective_locale
+      end
+
+      user.effective_locale = locale
+      user.admin = true
+    end
    result =
      helper.generate_and_send_prompt(
        completion_prompt,
        input,
-        current_user = Discourse.system_user,
+        current_user = user,
        _force_default_locale = false,
      )

@ -169,4 +243,73 @@ class DiscourseAi::Evals::Eval
    end
    result
  end
+
+  # Applies the Diff artifact update strategy to an artifact built from the
+  # given fixture files and returns the resulting sources.
+  #
+  # css_path, js_path, html_path - files holding the starting artifact content
+  # instructions_path            - file holding the change request for the LLM
+  #
+  # Returns { css:, js:, html: } taken from the artifact's last version.
+  # Raises EvalError when a search block could not be applied or when the
+  # resulting JavaScript does not pass the syntax check.
+  def edit_artifact(llm, css_path:, js_path:, html_path:, instructions_path:)
+    css = File.read(css_path)
+    js = File.read(js_path)
+    html = File.read(html_path)
+    instructions = File.read(instructions_path)
+    artifact =
+      AiArtifact.create!(
+        css: css,
+        js: js,
+        html: html,
+        user_id: Discourse.system_user.id,
+        post_id: 1,
+        name: "eval artifact",
+      )
+
+    # unsaved placeholder post; the strategy only needs something post-shaped
+    post = Post.new(topic_id: 1, id: 1)
+    diff =
+      DiscourseAi::AiBot::ArtifactUpdateStrategies::Diff.new(
+        llm: llm.llm_model.to_llm,
+        post: post,
+        user: Discourse.system_user,
+        artifact: artifact,
+        artifact_version: nil,
+        instructions: instructions,
+      )
+    diff.apply
+
+    if diff.failed_searches.present?
+      puts "Eval Errors encountered"
+      p diff.failed_searches
+      raise EvalError.new("Failed to apply all changes", diff.failed_searches)
+    end
+
+    version = artifact.versions.last
+    raise EvalError.new("Invalid JS", version.js) if !valid_javascript?(version.js)
+
+    { css: version.css, js: version.js, html: version.html }
+  ensure
+    # Clean up on every exit path, including the EvalError raises above, so
+    # failed runs do not leave eval artifacts behind. artifact is nil when
+    # reading the fixtures or create! itself failed.
+    artifact&.destroy
+  end
+
+  # Returns true when `node --check` accepts +str+ as syntactically valid
+  # JavaScript, false otherwise. Any StandardError raised along the way is
+  # also reported as false.
+  def valid_javascript?(str)
+    require "tempfile"
+
+    # node checks a file on disk; Tempfile.create removes it when the block exits
+    Tempfile.create(%w[test .js]) do |f|
+      f.write(str)
+      f.flush
+
+      begin
+        Discourse::Utils.execute_command(
+          "node",
+          "--check",
+          f.path,
+          failure_message: "Invalid JavaScript syntax",
+          timeout: 30, # reasonable timeout in seconds
+        )
+        true
+      rescue Discourse::Utils::CommandError
+        false
+      end
+    end
+  rescue StandardError
+    false
+  end
 end
--- a/evals/lib/runner.rb
+++ b/evals/lib/runner.rb
@ -155,9 +155,18 @@ class DiscourseAi::Evals::Runner

          if result[:result] == :fail
            puts "Failed 🔴"
-            puts "---- Expected ----\n#{result[:expected_output]}"
-            puts "---- Actual ----\n#{result[:actual_output]}"
+            puts "Error: #{result[:message]}" if result[:message]
+            # this is deliberate, it creates a lot of noise, but sometimes for debugging it's useful
+            #puts "Context: #{result[:context].to_s[0..2000]}" if result[:context]
+            if result[:expected_output] && result[:actual_output]
+              puts "---- Expected ----\n#{result[:expected_output]}"
+              puts "---- Actual ----\n#{result[:actual_output]}"
+            end
            logger.error("Evaluation failed with LLM: #{llm.name}")
+            logger.error("Error: #{result[:message]}") if result[:message]
+            logger.error("Expected: #{result[:expected_output]}") if result[:expected_output]
+            logger.error("Actual: #{result[:actual_output]}") if result[:actual_output]
+            logger.error("Context: #{result[:context]}") if result[:context]
          elsif result[:result] == :pass
            puts "Passed 🟢"
            logger.info("Evaluation passed with LLM: #{llm.name}")
--- a/lib/ai_bot/artifact_update_strategies/diff.rb
+++ b/lib/ai_bot/artifact_update_strategies/diff.rb
@ -3,8 +3,15 @@ module DiscourseAi
  module AiBot
    module ArtifactUpdateStrategies
      class Diff < Base
+        attr_reader :failed_searches
+
        private

+        # Passes all keyword arguments through to Base and starts with an
+        # empty failed_searches list; an entry is appended for every search
+        # block that raises NoMatchError while changes are applied.
+        def initialize(**kwargs)
+          super
+          @failed_searches = []
+        end
+
        def build_prompt
          DiscourseAi::Completions::Prompt.new(
            system_prompt,
@ -51,15 +58,21 @@ module DiscourseAi
            content = source.public_send(section == :javascript ? :js : section)
            blocks.each do |block|
              begin
-                content =
-                  DiscourseAi::Utils::DiffUtils::SimpleDiff.apply(
-                    content,
-                    block[:search],
-                    block[:replace],
-                  )
+                if !block[:search]
+                  content = block[:replace]
+                else
+                  content =
+                    DiscourseAi::Utils::DiffUtils::SimpleDiff.apply(
+                      content,
+                      block[:search],
+                      block[:replace],
+                    )
+                end
              rescue DiscourseAi::Utils::DiffUtils::SimpleDiff::NoMatchError
+                @failed_searches << { section: section, search: block[:search] }
                # TODO, we may need to inform caller here, LLM made a mistake which it
                # should correct
+                puts "Failed to find search: #{block[:search]}"
              end
            end
            updated_content[section == :javascript ? :js : section] = content
@ -76,7 +89,8 @@ module DiscourseAi
        private

        def extract_search_replace_blocks(content)
-          return nil if content.blank?
+          return nil if content.blank? || content.to_s.strip.downcase.match?(/^\(?no changes?\)?$/m)
+          return [{ replace: content }] if !content.match?(/<<+\s*SEARCH/)

          blocks = []
          remaining = content
@ -98,29 +112,35 @@ module DiscourseAi

            1. Use EXACTLY this format for changes:
               <<<<<<< SEARCH
-               (exact code to find)
+               (first line of code to replace)
+               (other lines of code to avoid ambiguity)
+               (last line of code to replace)
               =======
               (replacement code)
               >>>>>>> REPLACE
            2. DO NOT modify the markers or add spaces around them
            3. DO NOT add explanations or comments within sections
            4. ONLY include [HTML], [CSS], and [JavaScript] sections if they have changes
-            5. Ensure search text matches EXACTLY - partial matches will fail
-            6. Keep changes minimal and focused
-            7. HTML should not include <html>, <head>, or <body> tags, it is injected into a template
+            5. HTML should not include <html>, <head>, or <body> tags, it is injected into a template
+            6. When specifying a SEARCH block, ALWAYS keep it 8 lines or less, you will be interrupted and a retry will be required if you exceed this limit
+            7. NEVER EVER ask followup questions, ALL changes must be performed in a single response, you are consumed via an API, there is no opportunity for humans in the loop
+            8. When performing a non-contiguous search, ALWAYS use ... to denote the skipped lines
+            9. Be mindful that ... non-contiguous search is not greedy, the following line will only match the first occurrence of the search block
+            10. Never mix a full section replacement with a search/replace block in the same section
+            11. ALWAYS skip sections you do not want to change, do not include them in the response

            JavaScript libraries must be sourced from the following CDNs, otherwise CSP will reject it:
            #{AiArtifact::ALLOWED_CDN_SOURCES.join("\n")}

            Reply Format:
            [HTML]
-            (changes or empty if no changes)
+            (changes or empty if no changes or entire HTML)
            [/HTML]
            [CSS]
-            (changes or empty if no changes)
+            (changes or empty if no changes or entire CSS)
            [/CSS]
            [JavaScript]
-            (changes or empty if no changes)
+            (changes or empty if no changes or entire JavaScript)
            [/JavaScript]

            Example - Multiple changes in one file:
@ -152,6 +172,68 @@ module DiscourseAi
            .text { font-size: 16px; }
            >>>>>>> REPLACE
            [/CSS]
+
+            Example - Non contiguous search in CSS (replace most CSS with new CSS)
+
+            Original CSS:
+
+            [CSS]
+            body {
+              color: red;
+            }
+            .button {
+              color: blue;
+            }
+            .alert {
+              background-color: green;
+            }
+            .alert2 {
+              background-color: green;
+            }
+            [/CSS]
+
+            [CSS]
+            <<<<<<< SEARCH
+            body {
+            ...
+              background-color: green;
+            }
+            =======
+            body {
+              color: red;
+            }
+            >>>>>>> REPLACE
+            [/CSS]
+
+            RESULT:
+
+            [CSS]
+            body {
+              color: red;
+            }
+            .alert2 {
+              background-color: green;
+            }
+            [/CSS]
+
+            Example - full HTML replacement:
+
+            [HTML]
+            <div>something old</div>
+            <div>another somethin old</div>
+            [/HTML]
+
+            output:
+
+            [HTML]
+            <div>something new</div>
+            [/HTML]
+
+            result:
+            [HTML]
+            <div>something new</div>
+            [/HTML]
+
+
          PROMPT
        end

--- a/lib/utils/diff_utils/simple_diff.rb
+++ b/lib/utils/diff_utils/simple_diff.rb
@ -26,6 +26,8 @@ module DiscourseAi
          lines = content.split("\n")
          search_lines = search.split("\n")

+          ### TODO implement me
+
          # 1. Try exact matching
          match_positions =
            find_matches(lines, search_lines) { |line, search_line| line == search_line }
@ -38,7 +40,17 @@ module DiscourseAi
              end
          end

-          # 3. Try fuzzy matching
+          # 3. Try non-contiguous line based stripped matching
+          if match_positions.empty?
+            if range = non_contiguous_match_range(lines, search_lines)
+              first_match, last_match = range
+              lines.slice!(first_match, last_match - first_match + 1)
+              lines.insert(first_match, *replace.split("\n"))
+              return lines.join("\n")
+            end
+          end
+
+          # 4. Try fuzzy matching
          if match_positions.empty?
            match_positions =
              find_matches(lines, search_lines) do |line, search_line|
@ -46,7 +58,7 @@ module DiscourseAi
              end
          end

-          # 4. Try block matching as last resort
+          # 5. Try block matching as last resort
          if match_positions.empty?
            if block_matches = find_block_matches(content, search)
              return replace_blocks(content, block_matches, replace)
@ -68,6 +80,27 @@ module DiscourseAi

        private

+        # Finds the span of +lines+ covered by +search_lines+ when the search
+        # lines only need to appear in order, not adjacently. Lines are
+        # compared with surrounding whitespace stripped, and a search line
+        # consisting of "..." is a marker for skipped content.
+        #
+        # Scanning is a single forward pass, so each search line binds to the
+        # first content line that matches it. Content lines that match nothing
+        # are skipped whether or not a "..." marker precedes them.
+        #
+        # Returns [first_index, last_index] (inclusive) once every search line
+        # has been matched, or nil otherwise. A search that is empty, or that
+        # ends with a "..." marker, yields nil.
+        def non_contiguous_match_range(lines, search_lines)
+          # nothing to look for; also avoids calling strip on nil below
+          return nil if search_lines.empty?
+
+          first_idx = nil
+          last_idx = nil
+          search_index = 0
+
+          lines.each_with_index do |line, idx|
+            # step over a "..." marker; it never has to match a content line
+            if search_lines[search_index].strip == "..."
+              search_index += 1
+              break if search_lines[search_index].nil?
+            end
+
+            next if line.strip != search_lines[search_index].strip
+
+            first_idx ||= idx
+            last_idx = idx
+            search_index += 1
+            return first_idx, last_idx if search_index == search_lines.length
+          end
+
+          nil
+        end
+
        def find_matches(lines, search_lines)
          matches = []
          max_index = lines.length - search_lines.length
--- a/spec/fabricators/ai_artifact_fabricator.rb
+++ b/spec/fabricators/ai_artifact_fabricator.rb
@ -0,0 +1,19 @@
+# frozen_string_literal: true
+# Spec fabricator for an AI artifact: belongs to a fabricated user and post,
+# gets a sequenced name and small fixed html/css/js content.
+Fabricator(:ai_artifact) do
+  user
+  post
+  name { sequence(:name) { |i| "artifact_#{i}" } }
+  html { "<div>Test Content</div>" }
+  css { ".test { color: blue; }" }
+  js { "console.log('test');" }
+  metadata { { public: false } }
+end
+
+# Spec fabricator for a version of an AI artifact: fabricates its parent
+# artifact, uses a sequenced version number and fixed content that differs
+# from the parent's defaults.
+Fabricator(:ai_artifact_version) do
+  ai_artifact
+  version_number { sequence(:version_number) { |i| i } }
+  html { "<div>Version Content</div>" }
+  css { ".version { color: red; }" }
+  js { "console.log('version');" }
+  change_description { "Test change" }
+end
--- a/spec/lib/modules/ai_bot/artifact_update_strategies/diff_spec.rb
+++ b/spec/lib/modules/ai_bot/artifact_update_strategies/diff_spec.rb
@ -0,0 +1,214 @@
+# frozen_string_literal: true
+
+RSpec.describe DiscourseAi::AiBot::ArtifactUpdateStrategies::Diff do
+  fab!(:user)
+  fab!(:post)
+  fab!(:artifact) { Fabricate(:ai_artifact) }
+  fab!(:llm_model)
+
+  let(:llm) { llm_model.to_llm }
+  let(:instructions) { "Update the button color to red" }
+
+  let(:strategy) do
+    described_class.new(
+      llm: llm,
+      post: post,
+      user: user,
+      artifact: artifact,
+      artifact_version: nil,
+      instructions: instructions,
+    )
+  end
+
+  describe "#apply" do
+    it "processes simple search/replace blocks" do
+      original_css = ".button { color: blue; }"
+      artifact.update!(css: original_css)
+
+      response = <<~RESPONSE
+        [CSS]
+        <<<<<<< SEARCH
+        .button { color: blue; }
+        =======
+        .button { color: red; }
+        >>>>>>> REPLACE
+        [/CSS]
+      RESPONSE
+
+      DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
+
+      expect(artifact.versions.last.css).to eq(".button { color: red; }")
+    end
+
+    it "handles multiple search/replace blocks in the same section" do
+      original_css = <<~CSS
+        .button { color: blue; }
+        .text { font-size: 12px; }
+      CSS
+
+      artifact.update!(css: original_css)
+
+      response = <<~RESPONSE
+        [CSS]
+        <<<<<<< SEARCH
+        .button { color: blue; }
+        =======
+        .button { color: red; }
+        >>>>>>> REPLACE
+        <<<<<<< SEARCH
+        .text { font-size: 12px; }
+        =======
+        .text { font-size: 16px; }
+        >>>>>>> REPLACE
+        [/CSS]
+      RESPONSE
+
+      DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
+
+      expected = <<~CSS.strip
+        .button { color: red; }
+        .text { font-size: 16px; }
+      CSS
+
+      expect(artifact.versions.last.css.strip).to eq(expected.strip)
+    end
+
+    it "handles non-contiguous search/replace using ..." do
+      original_css = <<~CSS
+        body {
+          color: red;
+        }
+        .button {
+          color: blue;
+        }
+        .alert {
+          background-color: green;
+        }
+      CSS
+
+      artifact.update!(css: original_css)
+
+      response = <<~RESPONSE
+        [CSS]
+        <<<<<<< SEARCH
+        body {
+        ...
+        background-color: green;
+        }
+        =======
+        body {
+          color: red;
+        }
+        >>>>>>> REPLACE
+        [/CSS]
+      RESPONSE
+
+      DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
+
+      expect(artifact.versions.last.css).to eq("body {\n  color: red;\n}")
+    end
+
+    it "tracks failed searches" do
+      original_css = ".button { color: blue; }"
+      artifact.update!(css: original_css)
+
+      response = <<~RESPONSE
+        [CSS]
+        <<<<<<< SEARCH
+        .button { color: green; }
+        =======
+        .button { color: red; }
+        >>>>>>> REPLACE
+        [/CSS]
+      RESPONSE
+
+      DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
+
+      expect(strategy.failed_searches).to contain_exactly(
+        { section: :css, search: ".button { color: green; }" },
+      )
+      expect(artifact.versions.last.css).to eq(original_css)
+    end
+
+    it "handles complete section replacements" do
+      original_html = "<div>old content</div>"
+      artifact.update!(html: original_html)
+
+      response = <<~RESPONSE
+        [HTML]
+        <div>new content</div>
+        [/HTML]
+      RESPONSE
+
+      DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
+
+      expect(artifact.versions.last.html.strip).to eq("<div>new content</div>")
+    end
+
+    it "ignores empty or 'no changes' sections part 1" do
+      original = {
+        html: "<div>content</div>",
+        css: ".button { color: blue; }",
+        js: "console.log('test');",
+      }
+
+      artifact.update!(html: original[:html], css: original[:css], js: original[:js])
+
+      response = <<~RESPONSE
+        [HTML]
+        no changes
+        [/HTML]
+        [CSS]
+        (NO CHANGES)
+        [/CSS]
+        [JavaScript]
+        <<<<<<< SEARCH
+        console.log('test');
+        =======
+        console.log('(no changes)');
+        >>>>>>> REPLACE
+        [/JavaScript]
+      RESPONSE
+
+      DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
+
+      version = artifact.versions.last
+      expect(version.html).to eq(original[:html])
+      expect(version.css).to eq(original[:css])
+      expect(version.js).to eq("console.log('(no changes)');")
+    end
+
+    it "ignores empty or 'no changes' section part 2" do
+      original = {
+        html: "<div>content</div>",
+        css: ".button { color: blue; }",
+        js: "console.log('test');",
+      }
+
+      artifact.update!(html: original[:html], css: original[:css], js: original[:js])
+
+      response = <<~RESPONSE
+        [HTML]
+        (no changes)
+        [/HTML]
+        [CSS]
+
+        [/CSS]
+        [JavaScript]
+        <<<<<<< SEARCH
+        console.log('test');
+        =======
+        console.log('updated');
+        >>>>>>> REPLACE
+        [/JavaScript]
+      RESPONSE
+
+      DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
+
+      version = artifact.versions.last
+      expect(version.html).to eq(original[:html])
+      expect(version.css).to eq(original[:css])
+      expect(version.js).to eq("console.log('updated');")
+    end
+  end
+end
--- a/spec/lib/modules/ai_bot/tools/update_artifact_spec.rb
+++ b/spec/lib/modules/ai_bot/tools/update_artifact_spec.rb
@ -262,7 +262,7 @@ RSpec.describe DiscourseAi::AiBot::Tools::UpdateArtifact do
    [/CSS]

    [JavaScript]
-    nothing to do
+    no changes
    [/JavaScript]

    LLMs like to say nonsense that we can ignore here as well
--- a/spec/lib/utils/diff_utils/simple_diff_spec.rb
+++ b/spec/lib/utils/diff_utils/simple_diff_spec.rb
@ -171,5 +171,33 @@ RSpec.describe DiscourseAi::Utils::DiffUtils::SimpleDiff do

      expect(subject.apply(content, search, replace).strip).to eq(expected.strip)
    end
+
+    # Non-contiguous search: "..." stands for skipped lines and indentation
+    # differences are tolerated (" line3" in the search vs "line3" in the
+    # content). The match runs from the first "line1" through the second
+    # "line1"; only the trailing "line2" is expected to survive.
+    it "handles missing lines in search" do
+      original = <<~TEXT
+        line1
+         line2
+        line3
+        line4
+        line5
+        line1
+        line2
+      TEXT
+
+      search = <<~TEXT
+        line1
+        ...
+         line3
+        ...
+        line1
+      TEXT
+
+      # an empty replacement deletes the whole matched range
+      replace = ""
+
+      expected = <<~TEXT
+        line2
+      TEXT
+
+      expect(subject.apply(original, search, replace).strip).to eq(expected.strip)
+    end
  end
 end