DEV: improve artifact editing and eval system (#1130)

- Add non-contiguous search/replace support using ... syntax
- Add judge support for evaluating LLM outputs with ratings
- Improve error handling and reporting in eval runner
- Add full section replacement support without search blocks
- Add fabricators and specs for artifact diffing
- Track failed searches to improve debugging
- Add JS syntax validation for artifact versions in eval system
- Update prompt documentation with clear guidelines

* improve eval output

* move error handling

* llm as a judge

* fix spec

* small note on evals
This commit is contained in:
Sam 2025-02-19 15:44:33 +11:00 committed by GitHub
parent 02f0908963
commit 0c9466059c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 591 additions and 27 deletions

View File

@ -3,3 +3,26 @@
**Plugin Summary**
For more information, please see: https://meta.discourse.org/t/discourse-ai/259214?u=falco
### Evals
The directory `evals` contains AI evals for the Discourse AI plugin.
To run them use:
cd evals
./run --help
```
Usage: evals/run [options]
-e, --eval NAME Name of the evaluation to run
--list-models List models
-m, --model NAME Model to evaluate (will eval all models if not specified)
-l, --list List evals
```
To run evals you will need to configure API keys in your environment:
OPENAI_API_KEY=your_openai_api_key
ANTHROPIC_API_KEY=your_anthropic_api_key
GEMINI_API_KEY=your_gemini_api_key

View File

@ -4,6 +4,19 @@ class AiArtifactVersion < ActiveRecord::Base
validates :html, length: { maximum: 65_535 }
validates :css, length: { maximum: 65_535 }
validates :js, length: { maximum: 65_535 }
# used when generating test cases
def write_to(path)
css_path = "#{path}/main.css"
html_path = "#{path}/main.html"
js_path = "#{path}/main.js"
instructions_path = "#{path}/instructions.txt"
File.write(css_path, css)
File.write(html_path, html)
File.write(js_path, js)
File.write(instructions_path, change_description)
end
end
# == Schema Information

View File

@ -10,7 +10,17 @@ class DiscourseAi::Evals::Eval
:vision,
:expected_output,
:expected_output_regex,
:expected_tool_call
:expected_tool_call,
:judge
class EvalError < StandardError
attr_reader :context
def initialize(message, context)
super(message)
@context = context
end
end
def initialize(path:)
@yaml = YAML.load_file(path).symbolize_keys
@ -27,10 +37,14 @@ class DiscourseAi::Evals::Eval
Regexp.new(@expected_output_regex, Regexp::MULTILINE) if @expected_output_regex
@expected_tool_call = @yaml[:expected_tool_call]
@expected_tool_call.symbolize_keys! if @expected_tool_call
@judge = @yaml[:judge]
@judge.symbolize_keys! if @judge
@args[:path] = File.expand_path(File.join(File.dirname(path), @args[:path])) if @args&.key?(
:path,
)
@args.each do |key, value|
if (key.to_s.include?("_path") || key.to_s == "path") && value.is_a?(String)
@args[key] = File.expand_path(File.join(File.dirname(path), value))
end
end
end
def run(llm:)
@ -44,6 +58,8 @@ class DiscourseAi::Evals::Eval
image_to_text(llm, **args)
when "prompt"
prompt_call(llm, **args)
when "edit_artifact"
edit_artifact(llm, **args)
end
if expected_output
@ -53,7 +69,7 @@ class DiscourseAi::Evals::Eval
{ result: :fail, expected_output: expected_output, actual_output: result }
end
elsif expected_output_regex
if result.match?(expected_output_regex)
if result.to_s.match?(expected_output_regex)
{ result: :pass }
else
{ result: :fail, expected_output: expected_output_regex, actual_output: result }
@ -71,9 +87,13 @@ class DiscourseAi::Evals::Eval
else
{ result: :pass }
end
elsif judge
judge_result(result)
else
{ result: :unknown, actual_output: result }
{ result: :pass }
end
rescue EvalError => e
{ result: :fail, message: e.message, context: e.context }
end
def print
@ -96,14 +116,68 @@ class DiscourseAi::Evals::Eval
private
def helper(llm, input:, name:)
def judge_result(result)
prompt = judge[:prompt].dup
prompt.sub!("{{output}}", result)
prompt.sub!("{{input}}", args[:input])
prompt += <<~SUFFIX
Reply with a rating from 1 to 10, where 10 is perfect and 1 is terrible.
example output:
[RATING]10[/RATING] perfect output
example output:
[RATING]5[/RATING]
the following failed to preserve... etc...
SUFFIX
judge_llm = DiscourseAi::Evals::Llm.choose(judge[:llm]).first
DiscourseAi::Completions::Prompt.new(
"You are an expert judge tasked at testing LLM outputs.",
messages: [{ type: :user, content: prompt }],
)
result = judge_llm.llm_model.to_llm.generate(prompt, user: Discourse.system_user)
if rating = result.match(%r{\[RATING\](\d+)\[/RATING\]})
rating = rating[1].to_i
end
if rating.to_i >= judge[:pass_rating]
{ result: :pass }
else
{
result: :fail,
message: "LLM Rating below threshold, it was #{rating}, expecting #{judge[:pass_rating]}",
context: result,
}
end
end
def helper(llm, input:, name:, locale: nil)
completion_prompt = CompletionPrompt.find_by(name: name)
helper = DiscourseAi::AiHelper::Assistant.new(helper_llm: llm.llm_proxy)
user = Discourse.system_user
if locale
user = User.new
class << user
attr_accessor :effective_locale
end
user.effective_locale = locale
user.admin = true
end
result =
helper.generate_and_send_prompt(
completion_prompt,
input,
current_user = Discourse.system_user,
current_user = user,
_force_default_locale = false,
)
@ -169,4 +243,73 @@ class DiscourseAi::Evals::Eval
end
result
end
def edit_artifact(llm, css_path:, js_path:, html_path:, instructions_path:)
css = File.read(css_path)
js = File.read(js_path)
html = File.read(html_path)
instructions = File.read(instructions_path)
artifact =
AiArtifact.create!(
css: css,
js: js,
html: html,
user_id: Discourse.system_user.id,
post_id: 1,
name: "eval artifact",
)
post = Post.new(topic_id: 1, id: 1)
diff =
DiscourseAi::AiBot::ArtifactUpdateStrategies::Diff.new(
llm: llm.llm_model.to_llm,
post: post,
user: Discourse.system_user,
artifact: artifact,
artifact_version: nil,
instructions: instructions,
)
diff.apply
if diff.failed_searches.present?
puts "Eval Errors encountered"
p diff.failed_searches
raise EvalError.new("Failed to apply all changes", diff.failed_searches)
end
version = artifact.versions.last
raise EvalError.new("Invalid JS", version.js) if !valid_javascript?(version.js)
output = { css: version.css, js: version.js, html: version.html }
artifact.destroy
output
end
def valid_javascript?(str)
require "open3"
# Create a temporary file with the JavaScript code
Tempfile.create(%w[test .js]) do |f|
f.write(str)
f.flush
File.write("/tmp/test.js", str)
begin
Discourse::Utils.execute_command(
"node",
"--check",
f.path,
failure_message: "Invalid JavaScript syntax",
timeout: 30, # reasonable timeout in seconds
)
true
rescue Discourse::Utils::CommandError
false
end
end
rescue StandardError
false
end
end

View File

@ -155,9 +155,18 @@ class DiscourseAi::Evals::Runner
if result[:result] == :fail
puts "Failed 🔴"
puts "---- Expected ----\n#{result[:expected_output]}"
puts "---- Actual ----\n#{result[:actual_output]}"
puts "Error: #{result[:message]}" if result[:message]
# this is deliberate, it creates a lot of noise, but sometimes for debugging it's useful
#puts "Context: #{result[:context].to_s[0..2000]}" if result[:context]
if result[:expected_output] && result[:actual_output]
puts "---- Expected ----\n#{result[:expected_output]}"
puts "---- Actual ----\n#{result[:actual_output]}"
end
logger.error("Evaluation failed with LLM: #{llm.name}")
logger.error("Error: #{result[:message]}") if result[:message]
logger.error("Expected: #{result[:expected_output]}") if result[:expected_output]
logger.error("Actual: #{result[:actual_output]}") if result[:actual_output]
logger.error("Context: #{result[:context]}") if result[:context]
elsif result[:result] == :pass
puts "Passed 🟢"
logger.info("Evaluation passed with LLM: #{llm.name}")

View File

@ -3,8 +3,15 @@ module DiscourseAi
module AiBot
module ArtifactUpdateStrategies
class Diff < Base
attr_reader :failed_searches
private
def initialize(**kwargs)
super
@failed_searches = []
end
def build_prompt
DiscourseAi::Completions::Prompt.new(
system_prompt,
@ -51,15 +58,21 @@ module DiscourseAi
content = source.public_send(section == :javascript ? :js : section)
blocks.each do |block|
begin
content =
DiscourseAi::Utils::DiffUtils::SimpleDiff.apply(
content,
block[:search],
block[:replace],
)
if !block[:search]
content = block[:replace]
else
content =
DiscourseAi::Utils::DiffUtils::SimpleDiff.apply(
content,
block[:search],
block[:replace],
)
end
rescue DiscourseAi::Utils::DiffUtils::SimpleDiff::NoMatchError
@failed_searches << { section: section, search: block[:search] }
# TODO, we may need to inform caller here, LLM made a mistake which it
# should correct
puts "Failed to find search: #{block[:search]}"
end
end
updated_content[section == :javascript ? :js : section] = content
@ -76,7 +89,8 @@ module DiscourseAi
private
def extract_search_replace_blocks(content)
return nil if content.blank?
return nil if content.blank? || content.to_s.strip.downcase.match?(/^\(?no changes?\)?$/m)
return [{ replace: content }] if !content.match?(/<<+\s*SEARCH/)
blocks = []
remaining = content
@ -98,29 +112,35 @@ module DiscourseAi
1. Use EXACTLY this format for changes:
<<<<<<< SEARCH
(exact code to find)
(first line of code to replace)
(other lines of code to avoid ambiguity)
(last line of code to replace)
=======
(replacement code)
>>>>>>> REPLACE
2. DO NOT modify the markers or add spaces around them
3. DO NOT add explanations or comments within sections
4. ONLY include [HTML], [CSS], and [JavaScript] sections if they have changes
5. Ensure search text matches EXACTLY - partial matches will fail
6. Keep changes minimal and focused
7. HTML should not include <html>, <head>, or <body> tags, it is injected into a template
5. HTML should not include <html>, <head>, or <body> tags, it is injected into a template
6. When specifying a SEARCH block, ALWAYS keep it 8 lines or less, you will be interrupted and a retry will be required if you exceed this limit
7. NEVER EVER ask followup questions, ALL changes must be performed in a single response, you are consumed via an API, there is no opportunity for humans in the loop
8. When performing a non-contiguous search, ALWAYS use ... to denote the skipped lines
9. Be mindful that ... non-contiguous search is not greedy, the following line will only match the first occurrence of the search block
10. Never mix a full section replacement with a search/replace block in the same section
11. ALWAYS skip sections you to not want to change, do not include them in the response
JavaScript libraries must be sourced from the following CDNs, otherwise CSP will reject it:
#{AiArtifact::ALLOWED_CDN_SOURCES.join("\n")}
Reply Format:
[HTML]
(changes or empty if no changes)
(changes or empty if no changes or entire HTML)
[/HTML]
[CSS]
(changes or empty if no changes)
(changes or empty if no changes or entire CSS)
[/CSS]
[JavaScript]
(changes or empty if no changes)
(changes or empty if no changes or entire JavaScript)
[/JavaScript]
Example - Multiple changes in one file:
@ -152,6 +172,68 @@ module DiscourseAi
.text { font-size: 16px; }
>>>>>>> REPLACE
[/CSS]
Example - Non contiguous search in CSS (replace most CSS with new CSS)
Original CSS:
[CSS]
body {
color: red;
}
.button {
color: blue;
}
.alert {
background-color: green;
}
.alert2 {
background-color: green;
}
[/CSS]
[CSS]
<<<<<<< SEARCH
body {
...
background-color: green;
}
=======
body {
color: red;
}
>>>>>>> REPLACE
RESULT:
[CSS]
body {
color: red;
}
.alert2 {
background-color: green;
}
[/CSS]
Example - full HTML replacement:
[HTML]
<div>something old</div>
<div>another somethin old</div>
[/HTML]
output:
[HTML]
<div>something new</div>
[/HTML]
result:
[HTML]
<div>something new</div>
[/HTML]
PROMPT
end

View File

@ -26,6 +26,8 @@ module DiscourseAi
lines = content.split("\n")
search_lines = search.split("\n")
### TODO implement me
# 1. Try exact matching
match_positions =
find_matches(lines, search_lines) { |line, search_line| line == search_line }
@ -38,7 +40,17 @@ module DiscourseAi
end
end
# 3. Try fuzzy matching
# 3. Try non-contiguous line based stripped matching
if match_positions.empty?
if range = non_contiguous_match_range(lines, search_lines)
first_match, last_match = range
lines.slice!(first_match, last_match - first_match + 1)
lines.insert(first_match, *replace.split("\n"))
return lines.join("\n")
end
end
# 4. Try fuzzy matching
if match_positions.empty?
match_positions =
find_matches(lines, search_lines) do |line, search_line|
@ -46,7 +58,7 @@ module DiscourseAi
end
end
# 4. Try block matching as last resort
# 5. Try block matching as last resort
if match_positions.empty?
if block_matches = find_block_matches(content, search)
return replace_blocks(content, block_matches, replace)
@ -68,6 +80,27 @@ module DiscourseAi
private
def non_contiguous_match_range(lines, search_lines)
first_idx = nil
last_idx = nil
search_index = 0
lines.each_with_index do |line, idx|
if search_lines[search_index].strip == "..."
search_index += 1
break if search_lines[search_index].nil?
end
if line.strip == search_lines[search_index].strip
first_idx ||= idx
last_idx = idx
search_index += 1
return first_idx, last_idx if search_index == search_lines.length
end
end
nil
end
def find_matches(lines, search_lines)
matches = []
max_index = lines.length - search_lines.length

View File

@ -0,0 +1,19 @@
# frozen_string_literal: true
Fabricator(:ai_artifact) do
user
post
name { sequence(:name) { |i| "artifact_#{i}" } }
html { "<div>Test Content</div>" }
css { ".test { color: blue; }" }
js { "console.log('test');" }
metadata { { public: false } }
end
Fabricator(:ai_artifact_version) do
ai_artifact
version_number { sequence(:version_number) { |i| i } }
html { "<div>Version Content</div>" }
css { ".version { color: red; }" }
js { "console.log('version');" }
change_description { "Test change" }
end

View File

@ -0,0 +1,214 @@
# frozen_string_literal: true
RSpec.describe DiscourseAi::AiBot::ArtifactUpdateStrategies::Diff do
fab!(:user)
fab!(:post)
fab!(:artifact) { Fabricate(:ai_artifact) }
fab!(:llm_model)
let(:llm) { llm_model.to_llm }
let(:instructions) { "Update the button color to red" }
let(:strategy) do
described_class.new(
llm: llm,
post: post,
user: user,
artifact: artifact,
artifact_version: nil,
instructions: instructions,
)
end
describe "#apply" do
it "processes simple search/replace blocks" do
original_css = ".button { color: blue; }"
artifact.update!(css: original_css)
response = <<~RESPONSE
[CSS]
<<<<<<< SEARCH
.button { color: blue; }
=======
.button { color: red; }
>>>>>>> REPLACE
[/CSS]
RESPONSE
DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
expect(artifact.versions.last.css).to eq(".button { color: red; }")
end
it "handles multiple search/replace blocks in the same section" do
original_css = <<~CSS
.button { color: blue; }
.text { font-size: 12px; }
CSS
artifact.update!(css: original_css)
response = <<~RESPONSE
[CSS]
<<<<<<< SEARCH
.button { color: blue; }
=======
.button { color: red; }
>>>>>>> REPLACE
<<<<<<< SEARCH
.text { font-size: 12px; }
=======
.text { font-size: 16px; }
>>>>>>> REPLACE
[/CSS]
RESPONSE
DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
expected = <<~CSS.strip
.button { color: red; }
.text { font-size: 16px; }
CSS
expect(artifact.versions.last.css.strip).to eq(expected.strip)
end
it "handles non-contiguous search/replace using ..." do
original_css = <<~CSS
body {
color: red;
}
.button {
color: blue;
}
.alert {
background-color: green;
}
CSS
artifact.update!(css: original_css)
response = <<~RESPONSE
[CSS]
<<<<<<< SEARCH
body {
...
background-color: green;
}
=======
body {
color: red;
}
>>>>>>> REPLACE
[/CSS]
RESPONSE
DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
expect(artifact.versions.last.css).to eq("body {\n color: red;\n}")
end
it "tracks failed searches" do
original_css = ".button { color: blue; }"
artifact.update!(css: original_css)
response = <<~RESPONSE
[CSS]
<<<<<<< SEARCH
.button { color: green; }
=======
.button { color: red; }
>>>>>>> REPLACE
[/CSS]
RESPONSE
DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
expect(strategy.failed_searches).to contain_exactly(
{ section: :css, search: ".button { color: green; }" },
)
expect(artifact.versions.last.css).to eq(original_css)
end
it "handles complete section replacements" do
original_html = "<div>old content</div>"
artifact.update!(html: original_html)
response = <<~RESPONSE
[HTML]
<div>new content</div>
[/HTML]
RESPONSE
DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
expect(artifact.versions.last.html.strip).to eq("<div>new content</div>")
end
it "ignores empty or 'no changes' sections part 1" do
original = {
html: "<div>content</div>",
css: ".button { color: blue; }",
js: "console.log('test');",
}
artifact.update!(html: original[:html], css: original[:css], js: original[:js])
response = <<~RESPONSE
[HTML]
no changes
[/HTML]
[CSS]
(NO CHANGES)
[/CSS]
[JavaScript]
<<<<<<< SEARCH
console.log('test');
=======
console.log('(no changes)');
>>>>>>> REPLACE
[/JavaScript]
RESPONSE
DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
version = artifact.versions.last
expect(version.html).to eq(original[:html])
expect(version.css).to eq(original[:css])
expect(version.js).to eq("console.log('(no changes)');")
end
it "ignores empty or 'no changes' section part 2" do
original = {
html: "<div>content</div>",
css: ".button { color: blue; }",
js: "console.log('test');",
}
artifact.update!(html: original[:html], css: original[:css], js: original[:js])
response = <<~RESPONSE
[HTML]
(no changes)
[/HTML]
[CSS]
[/CSS]
[JavaScript]
<<<<<<< SEARCH
console.log('test');
=======
console.log('updated');
>>>>>>> REPLACE
[/JavaScript]
RESPONSE
DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
version = artifact.versions.last
expect(version.html).to eq(original[:html])
expect(version.css).to eq(original[:css])
expect(version.js).to eq("console.log('updated');")
end
end
end

View File

@ -262,7 +262,7 @@ RSpec.describe DiscourseAi::AiBot::Tools::UpdateArtifact do
[/CSS]
[JavaScript]
nothing to do
no changes
[/JavaScript]
LLMs like to say nonsense that we can ignore here as well

View File

@ -171,5 +171,33 @@ RSpec.describe DiscourseAi::Utils::DiffUtils::SimpleDiff do
expect(subject.apply(content, search, replace).strip).to eq(expected.strip)
end
it "handles missing lines in search" do
original = <<~TEXT
line1
line2
line3
line4
line5
line1
line2
TEXT
search = <<~TEXT
line1
...
line3
...
line1
TEXT
replace = ""
expected = <<~TEXT
line2
TEXT
expect(subject.apply(original, search, replace).strip).to eq(expected.strip)
end
end
end