DEV: improve artifact editing and eval system (#1130)

- Add non-contiguous search/replace support using ... syntax
- Add judge support for evaluating LLM outputs with ratings
- Improve error handling and reporting in eval runner
- Add full section replacement support without search blocks
- Add fabricators and specs for artifact diffing
- Track failed searches to improve debugging
- Add JS syntax validation for artifact versions in eval system
- Update prompt documentation with clear guidelines

* improve eval output

* move error handling

* llm as a judge

* fix spec

* small note on evals
This commit is contained in:
Sam 2025-02-19 15:44:33 +11:00 committed by GitHub
parent 02f0908963
commit 0c9466059c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 591 additions and 27 deletions

View File

@ -3,3 +3,26 @@
**Plugin Summary** **Plugin Summary**
For more information, please see: https://meta.discourse.org/t/discourse-ai/259214?u=falco For more information, please see: https://meta.discourse.org/t/discourse-ai/259214?u=falco
### Evals
The directory `evals` contains AI evals for the Discourse AI plugin.
To run them use:
cd evals
./run --help
```
Usage: evals/run [options]
-e, --eval NAME Name of the evaluation to run
--list-models List models
-m, --model NAME Model to evaluate (will eval all models if not specified)
-l, --list List evals
```
To run evals you will need to configure API keys in your environment:
OPENAI_API_KEY=your_openai_api_key
ANTHROPIC_API_KEY=your_anthropic_api_key
GEMINI_API_KEY=your_gemini_api_key

View File

@ -4,6 +4,19 @@ class AiArtifactVersion < ActiveRecord::Base
validates :html, length: { maximum: 65_535 } validates :html, length: { maximum: 65_535 }
validates :css, length: { maximum: 65_535 } validates :css, length: { maximum: 65_535 }
validates :js, length: { maximum: 65_535 } validates :js, length: { maximum: 65_535 }
# used when generating test cases
def write_to(path)
css_path = "#{path}/main.css"
html_path = "#{path}/main.html"
js_path = "#{path}/main.js"
instructions_path = "#{path}/instructions.txt"
File.write(css_path, css)
File.write(html_path, html)
File.write(js_path, js)
File.write(instructions_path, change_description)
end
end end
# == Schema Information # == Schema Information

View File

@ -10,7 +10,17 @@ class DiscourseAi::Evals::Eval
:vision, :vision,
:expected_output, :expected_output,
:expected_output_regex, :expected_output_regex,
:expected_tool_call :expected_tool_call,
:judge
class EvalError < StandardError
attr_reader :context
def initialize(message, context)
super(message)
@context = context
end
end
def initialize(path:) def initialize(path:)
@yaml = YAML.load_file(path).symbolize_keys @yaml = YAML.load_file(path).symbolize_keys
@ -27,10 +37,14 @@ class DiscourseAi::Evals::Eval
Regexp.new(@expected_output_regex, Regexp::MULTILINE) if @expected_output_regex Regexp.new(@expected_output_regex, Regexp::MULTILINE) if @expected_output_regex
@expected_tool_call = @yaml[:expected_tool_call] @expected_tool_call = @yaml[:expected_tool_call]
@expected_tool_call.symbolize_keys! if @expected_tool_call @expected_tool_call.symbolize_keys! if @expected_tool_call
@judge = @yaml[:judge]
@judge.symbolize_keys! if @judge
@args[:path] = File.expand_path(File.join(File.dirname(path), @args[:path])) if @args&.key?( @args.each do |key, value|
:path, if (key.to_s.include?("_path") || key.to_s == "path") && value.is_a?(String)
) @args[key] = File.expand_path(File.join(File.dirname(path), value))
end
end
end end
def run(llm:) def run(llm:)
@ -44,6 +58,8 @@ class DiscourseAi::Evals::Eval
image_to_text(llm, **args) image_to_text(llm, **args)
when "prompt" when "prompt"
prompt_call(llm, **args) prompt_call(llm, **args)
when "edit_artifact"
edit_artifact(llm, **args)
end end
if expected_output if expected_output
@ -53,7 +69,7 @@ class DiscourseAi::Evals::Eval
{ result: :fail, expected_output: expected_output, actual_output: result } { result: :fail, expected_output: expected_output, actual_output: result }
end end
elsif expected_output_regex elsif expected_output_regex
if result.match?(expected_output_regex) if result.to_s.match?(expected_output_regex)
{ result: :pass } { result: :pass }
else else
{ result: :fail, expected_output: expected_output_regex, actual_output: result } { result: :fail, expected_output: expected_output_regex, actual_output: result }
@ -71,9 +87,13 @@ class DiscourseAi::Evals::Eval
else else
{ result: :pass } { result: :pass }
end end
elsif judge
judge_result(result)
else else
{ result: :unknown, actual_output: result } { result: :pass }
end end
rescue EvalError => e
{ result: :fail, message: e.message, context: e.context }
end end
def print def print
@ -96,14 +116,68 @@ class DiscourseAi::Evals::Eval
private private
def helper(llm, input:, name:) def judge_result(result)
prompt = judge[:prompt].dup
prompt.sub!("{{output}}", result)
prompt.sub!("{{input}}", args[:input])
prompt += <<~SUFFIX
Reply with a rating from 1 to 10, where 10 is perfect and 1 is terrible.
example output:
[RATING]10[/RATING] perfect output
example output:
[RATING]5[/RATING]
the following failed to preserve... etc...
SUFFIX
judge_llm = DiscourseAi::Evals::Llm.choose(judge[:llm]).first
DiscourseAi::Completions::Prompt.new(
"You are an expert judge tasked at testing LLM outputs.",
messages: [{ type: :user, content: prompt }],
)
result = judge_llm.llm_model.to_llm.generate(prompt, user: Discourse.system_user)
if rating = result.match(%r{\[RATING\](\d+)\[/RATING\]})
rating = rating[1].to_i
end
if rating.to_i >= judge[:pass_rating]
{ result: :pass }
else
{
result: :fail,
message: "LLM Rating below threshold, it was #{rating}, expecting #{judge[:pass_rating]}",
context: result,
}
end
end
def helper(llm, input:, name:, locale: nil)
completion_prompt = CompletionPrompt.find_by(name: name) completion_prompt = CompletionPrompt.find_by(name: name)
helper = DiscourseAi::AiHelper::Assistant.new(helper_llm: llm.llm_proxy) helper = DiscourseAi::AiHelper::Assistant.new(helper_llm: llm.llm_proxy)
user = Discourse.system_user
if locale
user = User.new
class << user
attr_accessor :effective_locale
end
user.effective_locale = locale
user.admin = true
end
result = result =
helper.generate_and_send_prompt( helper.generate_and_send_prompt(
completion_prompt, completion_prompt,
input, input,
current_user = Discourse.system_user, current_user = user,
_force_default_locale = false, _force_default_locale = false,
) )
@ -169,4 +243,73 @@ class DiscourseAi::Evals::Eval
end end
result result
end end
def edit_artifact(llm, css_path:, js_path:, html_path:, instructions_path:)
css = File.read(css_path)
js = File.read(js_path)
html = File.read(html_path)
instructions = File.read(instructions_path)
artifact =
AiArtifact.create!(
css: css,
js: js,
html: html,
user_id: Discourse.system_user.id,
post_id: 1,
name: "eval artifact",
)
post = Post.new(topic_id: 1, id: 1)
diff =
DiscourseAi::AiBot::ArtifactUpdateStrategies::Diff.new(
llm: llm.llm_model.to_llm,
post: post,
user: Discourse.system_user,
artifact: artifact,
artifact_version: nil,
instructions: instructions,
)
diff.apply
if diff.failed_searches.present?
puts "Eval Errors encountered"
p diff.failed_searches
raise EvalError.new("Failed to apply all changes", diff.failed_searches)
end
version = artifact.versions.last
raise EvalError.new("Invalid JS", version.js) if !valid_javascript?(version.js)
output = { css: version.css, js: version.js, html: version.html }
artifact.destroy
output
end
def valid_javascript?(str)
require "open3"
# Create a temporary file with the JavaScript code
Tempfile.create(%w[test .js]) do |f|
f.write(str)
f.flush
File.write("/tmp/test.js", str)
begin
Discourse::Utils.execute_command(
"node",
"--check",
f.path,
failure_message: "Invalid JavaScript syntax",
timeout: 30, # reasonable timeout in seconds
)
true
rescue Discourse::Utils::CommandError
false
end
end
rescue StandardError
false
end
end end

View File

@ -155,9 +155,18 @@ class DiscourseAi::Evals::Runner
if result[:result] == :fail if result[:result] == :fail
puts "Failed 🔴" puts "Failed 🔴"
puts "Error: #{result[:message]}" if result[:message]
# this is deliberate, it creates a lot of noise, but sometimes for debugging it's useful
#puts "Context: #{result[:context].to_s[0..2000]}" if result[:context]
if result[:expected_output] && result[:actual_output]
puts "---- Expected ----\n#{result[:expected_output]}" puts "---- Expected ----\n#{result[:expected_output]}"
puts "---- Actual ----\n#{result[:actual_output]}" puts "---- Actual ----\n#{result[:actual_output]}"
end
logger.error("Evaluation failed with LLM: #{llm.name}") logger.error("Evaluation failed with LLM: #{llm.name}")
logger.error("Error: #{result[:message]}") if result[:message]
logger.error("Expected: #{result[:expected_output]}") if result[:expected_output]
logger.error("Actual: #{result[:actual_output]}") if result[:actual_output]
logger.error("Context: #{result[:context]}") if result[:context]
elsif result[:result] == :pass elsif result[:result] == :pass
puts "Passed 🟢" puts "Passed 🟢"
logger.info("Evaluation passed with LLM: #{llm.name}") logger.info("Evaluation passed with LLM: #{llm.name}")

View File

@ -3,8 +3,15 @@ module DiscourseAi
module AiBot module AiBot
module ArtifactUpdateStrategies module ArtifactUpdateStrategies
class Diff < Base class Diff < Base
attr_reader :failed_searches
private private
def initialize(**kwargs)
super
@failed_searches = []
end
def build_prompt def build_prompt
DiscourseAi::Completions::Prompt.new( DiscourseAi::Completions::Prompt.new(
system_prompt, system_prompt,
@ -51,15 +58,21 @@ module DiscourseAi
content = source.public_send(section == :javascript ? :js : section) content = source.public_send(section == :javascript ? :js : section)
blocks.each do |block| blocks.each do |block|
begin begin
if !block[:search]
content = block[:replace]
else
content = content =
DiscourseAi::Utils::DiffUtils::SimpleDiff.apply( DiscourseAi::Utils::DiffUtils::SimpleDiff.apply(
content, content,
block[:search], block[:search],
block[:replace], block[:replace],
) )
end
rescue DiscourseAi::Utils::DiffUtils::SimpleDiff::NoMatchError rescue DiscourseAi::Utils::DiffUtils::SimpleDiff::NoMatchError
@failed_searches << { section: section, search: block[:search] }
# TODO, we may need to inform caller here, LLM made a mistake which it # TODO, we may need to inform caller here, LLM made a mistake which it
# should correct # should correct
puts "Failed to find search: #{block[:search]}"
end end
end end
updated_content[section == :javascript ? :js : section] = content updated_content[section == :javascript ? :js : section] = content
@ -76,7 +89,8 @@ module DiscourseAi
private private
def extract_search_replace_blocks(content) def extract_search_replace_blocks(content)
return nil if content.blank? return nil if content.blank? || content.to_s.strip.downcase.match?(/^\(?no changes?\)?$/m)
return [{ replace: content }] if !content.match?(/<<+\s*SEARCH/)
blocks = [] blocks = []
remaining = content remaining = content
@ -98,29 +112,35 @@ module DiscourseAi
1. Use EXACTLY this format for changes: 1. Use EXACTLY this format for changes:
<<<<<<< SEARCH <<<<<<< SEARCH
(exact code to find) (first line of code to replace)
(other lines of code to avoid ambiguity)
(last line of code to replace)
======= =======
(replacement code) (replacement code)
>>>>>>> REPLACE >>>>>>> REPLACE
2. DO NOT modify the markers or add spaces around them 2. DO NOT modify the markers or add spaces around them
3. DO NOT add explanations or comments within sections 3. DO NOT add explanations or comments within sections
4. ONLY include [HTML], [CSS], and [JavaScript] sections if they have changes 4. ONLY include [HTML], [CSS], and [JavaScript] sections if they have changes
5. Ensure search text matches EXACTLY - partial matches will fail 5. HTML should not include <html>, <head>, or <body> tags, it is injected into a template
6. Keep changes minimal and focused 6. When specifying a SEARCH block, ALWAYS keep it 8 lines or less, you will be interrupted and a retry will be required if you exceed this limit
7. HTML should not include <html>, <head>, or <body> tags, it is injected into a template 7. NEVER EVER ask followup questions, ALL changes must be performed in a single response, you are consumed via an API, there is no opportunity for humans in the loop
8. When performing a non-contiguous search, ALWAYS use ... to denote the skipped lines
9. Be mindful that ... non-contiguous search is not greedy, the following line will only match the first occurrence of the search block
10. Never mix a full section replacement with a search/replace block in the same section
11. ALWAYS skip sections you to not want to change, do not include them in the response
JavaScript libraries must be sourced from the following CDNs, otherwise CSP will reject it: JavaScript libraries must be sourced from the following CDNs, otherwise CSP will reject it:
#{AiArtifact::ALLOWED_CDN_SOURCES.join("\n")} #{AiArtifact::ALLOWED_CDN_SOURCES.join("\n")}
Reply Format: Reply Format:
[HTML] [HTML]
(changes or empty if no changes) (changes or empty if no changes or entire HTML)
[/HTML] [/HTML]
[CSS] [CSS]
(changes or empty if no changes) (changes or empty if no changes or entire CSS)
[/CSS] [/CSS]
[JavaScript] [JavaScript]
(changes or empty if no changes) (changes or empty if no changes or entire JavaScript)
[/JavaScript] [/JavaScript]
Example - Multiple changes in one file: Example - Multiple changes in one file:
@ -152,6 +172,68 @@ module DiscourseAi
.text { font-size: 16px; } .text { font-size: 16px; }
>>>>>>> REPLACE >>>>>>> REPLACE
[/CSS] [/CSS]
Example - Non contiguous search in CSS (replace most CSS with new CSS)
Original CSS:
[CSS]
body {
color: red;
}
.button {
color: blue;
}
.alert {
background-color: green;
}
.alert2 {
background-color: green;
}
[/CSS]
[CSS]
<<<<<<< SEARCH
body {
...
background-color: green;
}
=======
body {
color: red;
}
>>>>>>> REPLACE
RESULT:
[CSS]
body {
color: red;
}
.alert2 {
background-color: green;
}
[/CSS]
Example - full HTML replacement:
[HTML]
<div>something old</div>
<div>another somethin old</div>
[/HTML]
output:
[HTML]
<div>something new</div>
[/HTML]
result:
[HTML]
<div>something new</div>
[/HTML]
PROMPT PROMPT
end end

View File

@ -26,6 +26,8 @@ module DiscourseAi
lines = content.split("\n") lines = content.split("\n")
search_lines = search.split("\n") search_lines = search.split("\n")
### TODO implement me
# 1. Try exact matching # 1. Try exact matching
match_positions = match_positions =
find_matches(lines, search_lines) { |line, search_line| line == search_line } find_matches(lines, search_lines) { |line, search_line| line == search_line }
@ -38,7 +40,17 @@ module DiscourseAi
end end
end end
# 3. Try fuzzy matching # 3. Try non-contiguous line based stripped matching
if match_positions.empty?
if range = non_contiguous_match_range(lines, search_lines)
first_match, last_match = range
lines.slice!(first_match, last_match - first_match + 1)
lines.insert(first_match, *replace.split("\n"))
return lines.join("\n")
end
end
# 4. Try fuzzy matching
if match_positions.empty? if match_positions.empty?
match_positions = match_positions =
find_matches(lines, search_lines) do |line, search_line| find_matches(lines, search_lines) do |line, search_line|
@ -46,7 +58,7 @@ module DiscourseAi
end end
end end
# 4. Try block matching as last resort # 5. Try block matching as last resort
if match_positions.empty? if match_positions.empty?
if block_matches = find_block_matches(content, search) if block_matches = find_block_matches(content, search)
return replace_blocks(content, block_matches, replace) return replace_blocks(content, block_matches, replace)
@ -68,6 +80,27 @@ module DiscourseAi
private private
def non_contiguous_match_range(lines, search_lines)
first_idx = nil
last_idx = nil
search_index = 0
lines.each_with_index do |line, idx|
if search_lines[search_index].strip == "..."
search_index += 1
break if search_lines[search_index].nil?
end
if line.strip == search_lines[search_index].strip
first_idx ||= idx
last_idx = idx
search_index += 1
return first_idx, last_idx if search_index == search_lines.length
end
end
nil
end
def find_matches(lines, search_lines) def find_matches(lines, search_lines)
matches = [] matches = []
max_index = lines.length - search_lines.length max_index = lines.length - search_lines.length

View File

@ -0,0 +1,19 @@
# frozen_string_literal: true
Fabricator(:ai_artifact) do
user
post
name { sequence(:name) { |i| "artifact_#{i}" } }
html { "<div>Test Content</div>" }
css { ".test { color: blue; }" }
js { "console.log('test');" }
metadata { { public: false } }
end
Fabricator(:ai_artifact_version) do
ai_artifact
version_number { sequence(:version_number) { |i| i } }
html { "<div>Version Content</div>" }
css { ".version { color: red; }" }
js { "console.log('version');" }
change_description { "Test change" }
end

View File

@ -0,0 +1,214 @@
# frozen_string_literal: true
RSpec.describe DiscourseAi::AiBot::ArtifactUpdateStrategies::Diff do
fab!(:user)
fab!(:post)
fab!(:artifact) { Fabricate(:ai_artifact) }
fab!(:llm_model)
let(:llm) { llm_model.to_llm }
let(:instructions) { "Update the button color to red" }
let(:strategy) do
described_class.new(
llm: llm,
post: post,
user: user,
artifact: artifact,
artifact_version: nil,
instructions: instructions,
)
end
describe "#apply" do
it "processes simple search/replace blocks" do
original_css = ".button { color: blue; }"
artifact.update!(css: original_css)
response = <<~RESPONSE
[CSS]
<<<<<<< SEARCH
.button { color: blue; }
=======
.button { color: red; }
>>>>>>> REPLACE
[/CSS]
RESPONSE
DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
expect(artifact.versions.last.css).to eq(".button { color: red; }")
end
it "handles multiple search/replace blocks in the same section" do
original_css = <<~CSS
.button { color: blue; }
.text { font-size: 12px; }
CSS
artifact.update!(css: original_css)
response = <<~RESPONSE
[CSS]
<<<<<<< SEARCH
.button { color: blue; }
=======
.button { color: red; }
>>>>>>> REPLACE
<<<<<<< SEARCH
.text { font-size: 12px; }
=======
.text { font-size: 16px; }
>>>>>>> REPLACE
[/CSS]
RESPONSE
DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
expected = <<~CSS.strip
.button { color: red; }
.text { font-size: 16px; }
CSS
expect(artifact.versions.last.css.strip).to eq(expected.strip)
end
it "handles non-contiguous search/replace using ..." do
original_css = <<~CSS
body {
color: red;
}
.button {
color: blue;
}
.alert {
background-color: green;
}
CSS
artifact.update!(css: original_css)
response = <<~RESPONSE
[CSS]
<<<<<<< SEARCH
body {
...
background-color: green;
}
=======
body {
color: red;
}
>>>>>>> REPLACE
[/CSS]
RESPONSE
DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
expect(artifact.versions.last.css).to eq("body {\n color: red;\n}")
end
it "tracks failed searches" do
original_css = ".button { color: blue; }"
artifact.update!(css: original_css)
response = <<~RESPONSE
[CSS]
<<<<<<< SEARCH
.button { color: green; }
=======
.button { color: red; }
>>>>>>> REPLACE
[/CSS]
RESPONSE
DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
expect(strategy.failed_searches).to contain_exactly(
{ section: :css, search: ".button { color: green; }" },
)
expect(artifact.versions.last.css).to eq(original_css)
end
it "handles complete section replacements" do
original_html = "<div>old content</div>"
artifact.update!(html: original_html)
response = <<~RESPONSE
[HTML]
<div>new content</div>
[/HTML]
RESPONSE
DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
expect(artifact.versions.last.html.strip).to eq("<div>new content</div>")
end
it "ignores empty or 'no changes' sections part 1" do
original = {
html: "<div>content</div>",
css: ".button { color: blue; }",
js: "console.log('test');",
}
artifact.update!(html: original[:html], css: original[:css], js: original[:js])
response = <<~RESPONSE
[HTML]
no changes
[/HTML]
[CSS]
(NO CHANGES)
[/CSS]
[JavaScript]
<<<<<<< SEARCH
console.log('test');
=======
console.log('(no changes)');
>>>>>>> REPLACE
[/JavaScript]
RESPONSE
DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
version = artifact.versions.last
expect(version.html).to eq(original[:html])
expect(version.css).to eq(original[:css])
expect(version.js).to eq("console.log('(no changes)');")
end
it "ignores empty or 'no changes' section part 2" do
original = {
html: "<div>content</div>",
css: ".button { color: blue; }",
js: "console.log('test');",
}
artifact.update!(html: original[:html], css: original[:css], js: original[:js])
response = <<~RESPONSE
[HTML]
(no changes)
[/HTML]
[CSS]
[/CSS]
[JavaScript]
<<<<<<< SEARCH
console.log('test');
=======
console.log('updated');
>>>>>>> REPLACE
[/JavaScript]
RESPONSE
DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
version = artifact.versions.last
expect(version.html).to eq(original[:html])
expect(version.css).to eq(original[:css])
expect(version.js).to eq("console.log('updated');")
end
end
end

View File

@ -262,7 +262,7 @@ RSpec.describe DiscourseAi::AiBot::Tools::UpdateArtifact do
[/CSS] [/CSS]
[JavaScript] [JavaScript]
nothing to do no changes
[/JavaScript] [/JavaScript]
LLMs like to say nonsense that we can ignore here as well LLMs like to say nonsense that we can ignore here as well

View File

@ -171,5 +171,33 @@ RSpec.describe DiscourseAi::Utils::DiffUtils::SimpleDiff do
expect(subject.apply(content, search, replace).strip).to eq(expected.strip) expect(subject.apply(content, search, replace).strip).to eq(expected.strip)
end end
it "handles missing lines in search" do
original = <<~TEXT
line1
line2
line3
line4
line5
line1
line2
TEXT
search = <<~TEXT
line1
...
line3
...
line1
TEXT
replace = ""
expected = <<~TEXT
line2
TEXT
expect(subject.apply(original, search, replace).strip).to eq(expected.strip)
end
end end
end end