mirror of
https://github.com/discourse/discourse-ai.git
synced 2025-06-29 02:52:16 +00:00
DEV: improve artifact editing and eval system (#1130)
- Add non-contiguous search/replace support using ... syntax - Add judge support for evaluating LLM outputs with ratings - Improve error handling and reporting in eval runner - Add full section replacement support without search blocks - Add fabricators and specs for artifact diffing - Track failed searches to improve debugging - Add JS syntax validation for artifact versions in eval system - Update prompt documentation with clear guidelines * improve eval output * move error handling * llm as a judge * fix spec * small note on evals
This commit is contained in:
parent
02f0908963
commit
0c9466059c
23
README.md
23
README.md
@ -3,3 +3,26 @@
|
|||||||
**Plugin Summary**
|
**Plugin Summary**
|
||||||
|
|
||||||
For more information, please see: https://meta.discourse.org/t/discourse-ai/259214?u=falco
|
For more information, please see: https://meta.discourse.org/t/discourse-ai/259214?u=falco
|
||||||
|
|
||||||
|
### Evals
|
||||||
|
|
||||||
|
The directory `evals` contains AI evals for the Discourse AI plugin.
|
||||||
|
|
||||||
|
To run them use:
|
||||||
|
|
||||||
|
cd evals
|
||||||
|
./run --help
|
||||||
|
|
||||||
|
```
|
||||||
|
Usage: evals/run [options]
|
||||||
|
-e, --eval NAME Name of the evaluation to run
|
||||||
|
--list-models List models
|
||||||
|
-m, --model NAME Model to evaluate (will eval all models if not specified)
|
||||||
|
-l, --list List evals
|
||||||
|
```
|
||||||
|
|
||||||
|
To run evals you will need to configure API keys in your environment:
|
||||||
|
|
||||||
|
OPENAI_API_KEY=your_openai_api_key
|
||||||
|
ANTHROPIC_API_KEY=your_anthropic_api_key
|
||||||
|
GEMINI_API_KEY=your_gemini_api_key
|
||||||
|
@ -4,6 +4,19 @@ class AiArtifactVersion < ActiveRecord::Base
|
|||||||
validates :html, length: { maximum: 65_535 }
|
validates :html, length: { maximum: 65_535 }
|
||||||
validates :css, length: { maximum: 65_535 }
|
validates :css, length: { maximum: 65_535 }
|
||||||
validates :js, length: { maximum: 65_535 }
|
validates :js, length: { maximum: 65_535 }
|
||||||
|
|
||||||
|
# used when generating test cases
|
||||||
|
def write_to(path)
|
||||||
|
css_path = "#{path}/main.css"
|
||||||
|
html_path = "#{path}/main.html"
|
||||||
|
js_path = "#{path}/main.js"
|
||||||
|
instructions_path = "#{path}/instructions.txt"
|
||||||
|
|
||||||
|
File.write(css_path, css)
|
||||||
|
File.write(html_path, html)
|
||||||
|
File.write(js_path, js)
|
||||||
|
File.write(instructions_path, change_description)
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
# == Schema Information
|
# == Schema Information
|
||||||
|
@ -10,7 +10,17 @@ class DiscourseAi::Evals::Eval
|
|||||||
:vision,
|
:vision,
|
||||||
:expected_output,
|
:expected_output,
|
||||||
:expected_output_regex,
|
:expected_output_regex,
|
||||||
:expected_tool_call
|
:expected_tool_call,
|
||||||
|
:judge
|
||||||
|
|
||||||
|
class EvalError < StandardError
|
||||||
|
attr_reader :context
|
||||||
|
|
||||||
|
def initialize(message, context)
|
||||||
|
super(message)
|
||||||
|
@context = context
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
def initialize(path:)
|
def initialize(path:)
|
||||||
@yaml = YAML.load_file(path).symbolize_keys
|
@yaml = YAML.load_file(path).symbolize_keys
|
||||||
@ -27,10 +37,14 @@ class DiscourseAi::Evals::Eval
|
|||||||
Regexp.new(@expected_output_regex, Regexp::MULTILINE) if @expected_output_regex
|
Regexp.new(@expected_output_regex, Regexp::MULTILINE) if @expected_output_regex
|
||||||
@expected_tool_call = @yaml[:expected_tool_call]
|
@expected_tool_call = @yaml[:expected_tool_call]
|
||||||
@expected_tool_call.symbolize_keys! if @expected_tool_call
|
@expected_tool_call.symbolize_keys! if @expected_tool_call
|
||||||
|
@judge = @yaml[:judge]
|
||||||
|
@judge.symbolize_keys! if @judge
|
||||||
|
|
||||||
@args[:path] = File.expand_path(File.join(File.dirname(path), @args[:path])) if @args&.key?(
|
@args.each do |key, value|
|
||||||
:path,
|
if (key.to_s.include?("_path") || key.to_s == "path") && value.is_a?(String)
|
||||||
)
|
@args[key] = File.expand_path(File.join(File.dirname(path), value))
|
||||||
|
end
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def run(llm:)
|
def run(llm:)
|
||||||
@ -44,6 +58,8 @@ class DiscourseAi::Evals::Eval
|
|||||||
image_to_text(llm, **args)
|
image_to_text(llm, **args)
|
||||||
when "prompt"
|
when "prompt"
|
||||||
prompt_call(llm, **args)
|
prompt_call(llm, **args)
|
||||||
|
when "edit_artifact"
|
||||||
|
edit_artifact(llm, **args)
|
||||||
end
|
end
|
||||||
|
|
||||||
if expected_output
|
if expected_output
|
||||||
@ -53,7 +69,7 @@ class DiscourseAi::Evals::Eval
|
|||||||
{ result: :fail, expected_output: expected_output, actual_output: result }
|
{ result: :fail, expected_output: expected_output, actual_output: result }
|
||||||
end
|
end
|
||||||
elsif expected_output_regex
|
elsif expected_output_regex
|
||||||
if result.match?(expected_output_regex)
|
if result.to_s.match?(expected_output_regex)
|
||||||
{ result: :pass }
|
{ result: :pass }
|
||||||
else
|
else
|
||||||
{ result: :fail, expected_output: expected_output_regex, actual_output: result }
|
{ result: :fail, expected_output: expected_output_regex, actual_output: result }
|
||||||
@ -71,9 +87,13 @@ class DiscourseAi::Evals::Eval
|
|||||||
else
|
else
|
||||||
{ result: :pass }
|
{ result: :pass }
|
||||||
end
|
end
|
||||||
|
elsif judge
|
||||||
|
judge_result(result)
|
||||||
else
|
else
|
||||||
{ result: :unknown, actual_output: result }
|
{ result: :pass }
|
||||||
end
|
end
|
||||||
|
rescue EvalError => e
|
||||||
|
{ result: :fail, message: e.message, context: e.context }
|
||||||
end
|
end
|
||||||
|
|
||||||
def print
|
def print
|
||||||
@ -96,14 +116,68 @@ class DiscourseAi::Evals::Eval
|
|||||||
|
|
||||||
private
|
private
|
||||||
|
|
||||||
def helper(llm, input:, name:)
|
def judge_result(result)
|
||||||
|
prompt = judge[:prompt].dup
|
||||||
|
prompt.sub!("{{output}}", result)
|
||||||
|
prompt.sub!("{{input}}", args[:input])
|
||||||
|
|
||||||
|
prompt += <<~SUFFIX
|
||||||
|
|
||||||
|
Reply with a rating from 1 to 10, where 10 is perfect and 1 is terrible.
|
||||||
|
|
||||||
|
example output:
|
||||||
|
|
||||||
|
[RATING]10[/RATING] perfect output
|
||||||
|
|
||||||
|
example output:
|
||||||
|
|
||||||
|
[RATING]5[/RATING]
|
||||||
|
|
||||||
|
the following failed to preserve... etc...
|
||||||
|
SUFFIX
|
||||||
|
|
||||||
|
judge_llm = DiscourseAi::Evals::Llm.choose(judge[:llm]).first
|
||||||
|
|
||||||
|
DiscourseAi::Completions::Prompt.new(
|
||||||
|
"You are an expert judge tasked at testing LLM outputs.",
|
||||||
|
messages: [{ type: :user, content: prompt }],
|
||||||
|
)
|
||||||
|
|
||||||
|
result = judge_llm.llm_model.to_llm.generate(prompt, user: Discourse.system_user)
|
||||||
|
|
||||||
|
if rating = result.match(%r{\[RATING\](\d+)\[/RATING\]})
|
||||||
|
rating = rating[1].to_i
|
||||||
|
end
|
||||||
|
|
||||||
|
if rating.to_i >= judge[:pass_rating]
|
||||||
|
{ result: :pass }
|
||||||
|
else
|
||||||
|
{
|
||||||
|
result: :fail,
|
||||||
|
message: "LLM Rating below threshold, it was #{rating}, expecting #{judge[:pass_rating]}",
|
||||||
|
context: result,
|
||||||
|
}
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
def helper(llm, input:, name:, locale: nil)
|
||||||
completion_prompt = CompletionPrompt.find_by(name: name)
|
completion_prompt = CompletionPrompt.find_by(name: name)
|
||||||
helper = DiscourseAi::AiHelper::Assistant.new(helper_llm: llm.llm_proxy)
|
helper = DiscourseAi::AiHelper::Assistant.new(helper_llm: llm.llm_proxy)
|
||||||
|
user = Discourse.system_user
|
||||||
|
if locale
|
||||||
|
user = User.new
|
||||||
|
class << user
|
||||||
|
attr_accessor :effective_locale
|
||||||
|
end
|
||||||
|
|
||||||
|
user.effective_locale = locale
|
||||||
|
user.admin = true
|
||||||
|
end
|
||||||
result =
|
result =
|
||||||
helper.generate_and_send_prompt(
|
helper.generate_and_send_prompt(
|
||||||
completion_prompt,
|
completion_prompt,
|
||||||
input,
|
input,
|
||||||
current_user = Discourse.system_user,
|
current_user = user,
|
||||||
_force_default_locale = false,
|
_force_default_locale = false,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -169,4 +243,73 @@ class DiscourseAi::Evals::Eval
|
|||||||
end
|
end
|
||||||
result
|
result
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def edit_artifact(llm, css_path:, js_path:, html_path:, instructions_path:)
|
||||||
|
css = File.read(css_path)
|
||||||
|
js = File.read(js_path)
|
||||||
|
html = File.read(html_path)
|
||||||
|
instructions = File.read(instructions_path)
|
||||||
|
artifact =
|
||||||
|
AiArtifact.create!(
|
||||||
|
css: css,
|
||||||
|
js: js,
|
||||||
|
html: html,
|
||||||
|
user_id: Discourse.system_user.id,
|
||||||
|
post_id: 1,
|
||||||
|
name: "eval artifact",
|
||||||
|
)
|
||||||
|
|
||||||
|
post = Post.new(topic_id: 1, id: 1)
|
||||||
|
diff =
|
||||||
|
DiscourseAi::AiBot::ArtifactUpdateStrategies::Diff.new(
|
||||||
|
llm: llm.llm_model.to_llm,
|
||||||
|
post: post,
|
||||||
|
user: Discourse.system_user,
|
||||||
|
artifact: artifact,
|
||||||
|
artifact_version: nil,
|
||||||
|
instructions: instructions,
|
||||||
|
)
|
||||||
|
diff.apply
|
||||||
|
|
||||||
|
if diff.failed_searches.present?
|
||||||
|
puts "Eval Errors encountered"
|
||||||
|
p diff.failed_searches
|
||||||
|
raise EvalError.new("Failed to apply all changes", diff.failed_searches)
|
||||||
|
end
|
||||||
|
|
||||||
|
version = artifact.versions.last
|
||||||
|
raise EvalError.new("Invalid JS", version.js) if !valid_javascript?(version.js)
|
||||||
|
|
||||||
|
output = { css: version.css, js: version.js, html: version.html }
|
||||||
|
|
||||||
|
artifact.destroy
|
||||||
|
output
|
||||||
|
end
|
||||||
|
|
||||||
|
def valid_javascript?(str)
|
||||||
|
require "open3"
|
||||||
|
|
||||||
|
# Create a temporary file with the JavaScript code
|
||||||
|
Tempfile.create(%w[test .js]) do |f|
|
||||||
|
f.write(str)
|
||||||
|
f.flush
|
||||||
|
|
||||||
|
File.write("/tmp/test.js", str)
|
||||||
|
|
||||||
|
begin
|
||||||
|
Discourse::Utils.execute_command(
|
||||||
|
"node",
|
||||||
|
"--check",
|
||||||
|
f.path,
|
||||||
|
failure_message: "Invalid JavaScript syntax",
|
||||||
|
timeout: 30, # reasonable timeout in seconds
|
||||||
|
)
|
||||||
|
true
|
||||||
|
rescue Discourse::Utils::CommandError
|
||||||
|
false
|
||||||
|
end
|
||||||
|
end
|
||||||
|
rescue StandardError
|
||||||
|
false
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
@ -155,9 +155,18 @@ class DiscourseAi::Evals::Runner
|
|||||||
|
|
||||||
if result[:result] == :fail
|
if result[:result] == :fail
|
||||||
puts "Failed 🔴"
|
puts "Failed 🔴"
|
||||||
|
puts "Error: #{result[:message]}" if result[:message]
|
||||||
|
# this is deliberate, it creates a lot of noise, but sometimes for debugging it's useful
|
||||||
|
#puts "Context: #{result[:context].to_s[0..2000]}" if result[:context]
|
||||||
|
if result[:expected_output] && result[:actual_output]
|
||||||
puts "---- Expected ----\n#{result[:expected_output]}"
|
puts "---- Expected ----\n#{result[:expected_output]}"
|
||||||
puts "---- Actual ----\n#{result[:actual_output]}"
|
puts "---- Actual ----\n#{result[:actual_output]}"
|
||||||
|
end
|
||||||
logger.error("Evaluation failed with LLM: #{llm.name}")
|
logger.error("Evaluation failed with LLM: #{llm.name}")
|
||||||
|
logger.error("Error: #{result[:message]}") if result[:message]
|
||||||
|
logger.error("Expected: #{result[:expected_output]}") if result[:expected_output]
|
||||||
|
logger.error("Actual: #{result[:actual_output]}") if result[:actual_output]
|
||||||
|
logger.error("Context: #{result[:context]}") if result[:context]
|
||||||
elsif result[:result] == :pass
|
elsif result[:result] == :pass
|
||||||
puts "Passed 🟢"
|
puts "Passed 🟢"
|
||||||
logger.info("Evaluation passed with LLM: #{llm.name}")
|
logger.info("Evaluation passed with LLM: #{llm.name}")
|
||||||
|
@ -3,8 +3,15 @@ module DiscourseAi
|
|||||||
module AiBot
|
module AiBot
|
||||||
module ArtifactUpdateStrategies
|
module ArtifactUpdateStrategies
|
||||||
class Diff < Base
|
class Diff < Base
|
||||||
|
attr_reader :failed_searches
|
||||||
|
|
||||||
private
|
private
|
||||||
|
|
||||||
|
def initialize(**kwargs)
|
||||||
|
super
|
||||||
|
@failed_searches = []
|
||||||
|
end
|
||||||
|
|
||||||
def build_prompt
|
def build_prompt
|
||||||
DiscourseAi::Completions::Prompt.new(
|
DiscourseAi::Completions::Prompt.new(
|
||||||
system_prompt,
|
system_prompt,
|
||||||
@ -51,15 +58,21 @@ module DiscourseAi
|
|||||||
content = source.public_send(section == :javascript ? :js : section)
|
content = source.public_send(section == :javascript ? :js : section)
|
||||||
blocks.each do |block|
|
blocks.each do |block|
|
||||||
begin
|
begin
|
||||||
|
if !block[:search]
|
||||||
|
content = block[:replace]
|
||||||
|
else
|
||||||
content =
|
content =
|
||||||
DiscourseAi::Utils::DiffUtils::SimpleDiff.apply(
|
DiscourseAi::Utils::DiffUtils::SimpleDiff.apply(
|
||||||
content,
|
content,
|
||||||
block[:search],
|
block[:search],
|
||||||
block[:replace],
|
block[:replace],
|
||||||
)
|
)
|
||||||
|
end
|
||||||
rescue DiscourseAi::Utils::DiffUtils::SimpleDiff::NoMatchError
|
rescue DiscourseAi::Utils::DiffUtils::SimpleDiff::NoMatchError
|
||||||
|
@failed_searches << { section: section, search: block[:search] }
|
||||||
# TODO, we may need to inform caller here, LLM made a mistake which it
|
# TODO, we may need to inform caller here, LLM made a mistake which it
|
||||||
# should correct
|
# should correct
|
||||||
|
puts "Failed to find search: #{block[:search]}"
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
updated_content[section == :javascript ? :js : section] = content
|
updated_content[section == :javascript ? :js : section] = content
|
||||||
@ -76,7 +89,8 @@ module DiscourseAi
|
|||||||
private
|
private
|
||||||
|
|
||||||
def extract_search_replace_blocks(content)
|
def extract_search_replace_blocks(content)
|
||||||
return nil if content.blank?
|
return nil if content.blank? || content.to_s.strip.downcase.match?(/^\(?no changes?\)?$/m)
|
||||||
|
return [{ replace: content }] if !content.match?(/<<+\s*SEARCH/)
|
||||||
|
|
||||||
blocks = []
|
blocks = []
|
||||||
remaining = content
|
remaining = content
|
||||||
@ -98,29 +112,35 @@ module DiscourseAi
|
|||||||
|
|
||||||
1. Use EXACTLY this format for changes:
|
1. Use EXACTLY this format for changes:
|
||||||
<<<<<<< SEARCH
|
<<<<<<< SEARCH
|
||||||
(exact code to find)
|
(first line of code to replace)
|
||||||
|
(other lines of code to avoid ambiguity)
|
||||||
|
(last line of code to replace)
|
||||||
=======
|
=======
|
||||||
(replacement code)
|
(replacement code)
|
||||||
>>>>>>> REPLACE
|
>>>>>>> REPLACE
|
||||||
2. DO NOT modify the markers or add spaces around them
|
2. DO NOT modify the markers or add spaces around them
|
||||||
3. DO NOT add explanations or comments within sections
|
3. DO NOT add explanations or comments within sections
|
||||||
4. ONLY include [HTML], [CSS], and [JavaScript] sections if they have changes
|
4. ONLY include [HTML], [CSS], and [JavaScript] sections if they have changes
|
||||||
5. Ensure search text matches EXACTLY - partial matches will fail
|
5. HTML should not include <html>, <head>, or <body> tags, it is injected into a template
|
||||||
6. Keep changes minimal and focused
|
6. When specifying a SEARCH block, ALWAYS keep it 8 lines or less, you will be interrupted and a retry will be required if you exceed this limit
|
||||||
7. HTML should not include <html>, <head>, or <body> tags, it is injected into a template
|
7. NEVER EVER ask followup questions, ALL changes must be performed in a single response, you are consumed via an API, there is no opportunity for humans in the loop
|
||||||
|
8. When performing a non-contiguous search, ALWAYS use ... to denote the skipped lines
|
||||||
|
9. Be mindful that ... non-contiguous search is not greedy, the following line will only match the first occurrence of the search block
|
||||||
|
10. Never mix a full section replacement with a search/replace block in the same section
|
||||||
|
11. ALWAYS skip sections you to not want to change, do not include them in the response
|
||||||
|
|
||||||
JavaScript libraries must be sourced from the following CDNs, otherwise CSP will reject it:
|
JavaScript libraries must be sourced from the following CDNs, otherwise CSP will reject it:
|
||||||
#{AiArtifact::ALLOWED_CDN_SOURCES.join("\n")}
|
#{AiArtifact::ALLOWED_CDN_SOURCES.join("\n")}
|
||||||
|
|
||||||
Reply Format:
|
Reply Format:
|
||||||
[HTML]
|
[HTML]
|
||||||
(changes or empty if no changes)
|
(changes or empty if no changes or entire HTML)
|
||||||
[/HTML]
|
[/HTML]
|
||||||
[CSS]
|
[CSS]
|
||||||
(changes or empty if no changes)
|
(changes or empty if no changes or entire CSS)
|
||||||
[/CSS]
|
[/CSS]
|
||||||
[JavaScript]
|
[JavaScript]
|
||||||
(changes or empty if no changes)
|
(changes or empty if no changes or entire JavaScript)
|
||||||
[/JavaScript]
|
[/JavaScript]
|
||||||
|
|
||||||
Example - Multiple changes in one file:
|
Example - Multiple changes in one file:
|
||||||
@ -152,6 +172,68 @@ module DiscourseAi
|
|||||||
.text { font-size: 16px; }
|
.text { font-size: 16px; }
|
||||||
>>>>>>> REPLACE
|
>>>>>>> REPLACE
|
||||||
[/CSS]
|
[/CSS]
|
||||||
|
|
||||||
|
Example - Non contiguous search in CSS (replace most CSS with new CSS)
|
||||||
|
|
||||||
|
Original CSS:
|
||||||
|
|
||||||
|
[CSS]
|
||||||
|
body {
|
||||||
|
color: red;
|
||||||
|
}
|
||||||
|
.button {
|
||||||
|
color: blue;
|
||||||
|
}
|
||||||
|
.alert {
|
||||||
|
background-color: green;
|
||||||
|
}
|
||||||
|
.alert2 {
|
||||||
|
background-color: green;
|
||||||
|
}
|
||||||
|
[/CSS]
|
||||||
|
|
||||||
|
[CSS]
|
||||||
|
<<<<<<< SEARCH
|
||||||
|
body {
|
||||||
|
...
|
||||||
|
background-color: green;
|
||||||
|
}
|
||||||
|
=======
|
||||||
|
body {
|
||||||
|
color: red;
|
||||||
|
}
|
||||||
|
>>>>>>> REPLACE
|
||||||
|
|
||||||
|
RESULT:
|
||||||
|
|
||||||
|
[CSS]
|
||||||
|
body {
|
||||||
|
color: red;
|
||||||
|
}
|
||||||
|
.alert2 {
|
||||||
|
background-color: green;
|
||||||
|
}
|
||||||
|
[/CSS]
|
||||||
|
|
||||||
|
Example - full HTML replacement:
|
||||||
|
|
||||||
|
[HTML]
|
||||||
|
<div>something old</div>
|
||||||
|
<div>another somethin old</div>
|
||||||
|
[/HTML]
|
||||||
|
|
||||||
|
output:
|
||||||
|
|
||||||
|
[HTML]
|
||||||
|
<div>something new</div>
|
||||||
|
[/HTML]
|
||||||
|
|
||||||
|
result:
|
||||||
|
[HTML]
|
||||||
|
<div>something new</div>
|
||||||
|
[/HTML]
|
||||||
|
|
||||||
|
|
||||||
PROMPT
|
PROMPT
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -26,6 +26,8 @@ module DiscourseAi
|
|||||||
lines = content.split("\n")
|
lines = content.split("\n")
|
||||||
search_lines = search.split("\n")
|
search_lines = search.split("\n")
|
||||||
|
|
||||||
|
### TODO implement me
|
||||||
|
|
||||||
# 1. Try exact matching
|
# 1. Try exact matching
|
||||||
match_positions =
|
match_positions =
|
||||||
find_matches(lines, search_lines) { |line, search_line| line == search_line }
|
find_matches(lines, search_lines) { |line, search_line| line == search_line }
|
||||||
@ -38,7 +40,17 @@ module DiscourseAi
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
# 3. Try fuzzy matching
|
# 3. Try non-contiguous line based stripped matching
|
||||||
|
if match_positions.empty?
|
||||||
|
if range = non_contiguous_match_range(lines, search_lines)
|
||||||
|
first_match, last_match = range
|
||||||
|
lines.slice!(first_match, last_match - first_match + 1)
|
||||||
|
lines.insert(first_match, *replace.split("\n"))
|
||||||
|
return lines.join("\n")
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# 4. Try fuzzy matching
|
||||||
if match_positions.empty?
|
if match_positions.empty?
|
||||||
match_positions =
|
match_positions =
|
||||||
find_matches(lines, search_lines) do |line, search_line|
|
find_matches(lines, search_lines) do |line, search_line|
|
||||||
@ -46,7 +58,7 @@ module DiscourseAi
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
# 4. Try block matching as last resort
|
# 5. Try block matching as last resort
|
||||||
if match_positions.empty?
|
if match_positions.empty?
|
||||||
if block_matches = find_block_matches(content, search)
|
if block_matches = find_block_matches(content, search)
|
||||||
return replace_blocks(content, block_matches, replace)
|
return replace_blocks(content, block_matches, replace)
|
||||||
@ -68,6 +80,27 @@ module DiscourseAi
|
|||||||
|
|
||||||
private
|
private
|
||||||
|
|
||||||
|
def non_contiguous_match_range(lines, search_lines)
|
||||||
|
first_idx = nil
|
||||||
|
last_idx = nil
|
||||||
|
search_index = 0
|
||||||
|
|
||||||
|
lines.each_with_index do |line, idx|
|
||||||
|
if search_lines[search_index].strip == "..."
|
||||||
|
search_index += 1
|
||||||
|
break if search_lines[search_index].nil?
|
||||||
|
end
|
||||||
|
if line.strip == search_lines[search_index].strip
|
||||||
|
first_idx ||= idx
|
||||||
|
last_idx = idx
|
||||||
|
search_index += 1
|
||||||
|
return first_idx, last_idx if search_index == search_lines.length
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
nil
|
||||||
|
end
|
||||||
|
|
||||||
def find_matches(lines, search_lines)
|
def find_matches(lines, search_lines)
|
||||||
matches = []
|
matches = []
|
||||||
max_index = lines.length - search_lines.length
|
max_index = lines.length - search_lines.length
|
||||||
|
19
spec/fabricators/ai_artifact_fabricator.rb
Normal file
19
spec/fabricators/ai_artifact_fabricator.rb
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
# frozen_string_literal: true
|
||||||
|
Fabricator(:ai_artifact) do
|
||||||
|
user
|
||||||
|
post
|
||||||
|
name { sequence(:name) { |i| "artifact_#{i}" } }
|
||||||
|
html { "<div>Test Content</div>" }
|
||||||
|
css { ".test { color: blue; }" }
|
||||||
|
js { "console.log('test');" }
|
||||||
|
metadata { { public: false } }
|
||||||
|
end
|
||||||
|
|
||||||
|
Fabricator(:ai_artifact_version) do
|
||||||
|
ai_artifact
|
||||||
|
version_number { sequence(:version_number) { |i| i } }
|
||||||
|
html { "<div>Version Content</div>" }
|
||||||
|
css { ".version { color: red; }" }
|
||||||
|
js { "console.log('version');" }
|
||||||
|
change_description { "Test change" }
|
||||||
|
end
|
214
spec/lib/modules/ai_bot/artifact_update_strategies/diff_spec.rb
Normal file
214
spec/lib/modules/ai_bot/artifact_update_strategies/diff_spec.rb
Normal file
@ -0,0 +1,214 @@
|
|||||||
|
# frozen_string_literal: true
|
||||||
|
|
||||||
|
RSpec.describe DiscourseAi::AiBot::ArtifactUpdateStrategies::Diff do
|
||||||
|
fab!(:user)
|
||||||
|
fab!(:post)
|
||||||
|
fab!(:artifact) { Fabricate(:ai_artifact) }
|
||||||
|
fab!(:llm_model)
|
||||||
|
|
||||||
|
let(:llm) { llm_model.to_llm }
|
||||||
|
let(:instructions) { "Update the button color to red" }
|
||||||
|
|
||||||
|
let(:strategy) do
|
||||||
|
described_class.new(
|
||||||
|
llm: llm,
|
||||||
|
post: post,
|
||||||
|
user: user,
|
||||||
|
artifact: artifact,
|
||||||
|
artifact_version: nil,
|
||||||
|
instructions: instructions,
|
||||||
|
)
|
||||||
|
end
|
||||||
|
|
||||||
|
describe "#apply" do
|
||||||
|
it "processes simple search/replace blocks" do
|
||||||
|
original_css = ".button { color: blue; }"
|
||||||
|
artifact.update!(css: original_css)
|
||||||
|
|
||||||
|
response = <<~RESPONSE
|
||||||
|
[CSS]
|
||||||
|
<<<<<<< SEARCH
|
||||||
|
.button { color: blue; }
|
||||||
|
=======
|
||||||
|
.button { color: red; }
|
||||||
|
>>>>>>> REPLACE
|
||||||
|
[/CSS]
|
||||||
|
RESPONSE
|
||||||
|
|
||||||
|
DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
|
||||||
|
|
||||||
|
expect(artifact.versions.last.css).to eq(".button { color: red; }")
|
||||||
|
end
|
||||||
|
|
||||||
|
it "handles multiple search/replace blocks in the same section" do
|
||||||
|
original_css = <<~CSS
|
||||||
|
.button { color: blue; }
|
||||||
|
.text { font-size: 12px; }
|
||||||
|
CSS
|
||||||
|
|
||||||
|
artifact.update!(css: original_css)
|
||||||
|
|
||||||
|
response = <<~RESPONSE
|
||||||
|
[CSS]
|
||||||
|
<<<<<<< SEARCH
|
||||||
|
.button { color: blue; }
|
||||||
|
=======
|
||||||
|
.button { color: red; }
|
||||||
|
>>>>>>> REPLACE
|
||||||
|
<<<<<<< SEARCH
|
||||||
|
.text { font-size: 12px; }
|
||||||
|
=======
|
||||||
|
.text { font-size: 16px; }
|
||||||
|
>>>>>>> REPLACE
|
||||||
|
[/CSS]
|
||||||
|
RESPONSE
|
||||||
|
|
||||||
|
DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
|
||||||
|
|
||||||
|
expected = <<~CSS.strip
|
||||||
|
.button { color: red; }
|
||||||
|
.text { font-size: 16px; }
|
||||||
|
CSS
|
||||||
|
|
||||||
|
expect(artifact.versions.last.css.strip).to eq(expected.strip)
|
||||||
|
end
|
||||||
|
|
||||||
|
it "handles non-contiguous search/replace using ..." do
|
||||||
|
original_css = <<~CSS
|
||||||
|
body {
|
||||||
|
color: red;
|
||||||
|
}
|
||||||
|
.button {
|
||||||
|
color: blue;
|
||||||
|
}
|
||||||
|
.alert {
|
||||||
|
background-color: green;
|
||||||
|
}
|
||||||
|
CSS
|
||||||
|
|
||||||
|
artifact.update!(css: original_css)
|
||||||
|
|
||||||
|
response = <<~RESPONSE
|
||||||
|
[CSS]
|
||||||
|
<<<<<<< SEARCH
|
||||||
|
body {
|
||||||
|
...
|
||||||
|
background-color: green;
|
||||||
|
}
|
||||||
|
=======
|
||||||
|
body {
|
||||||
|
color: red;
|
||||||
|
}
|
||||||
|
>>>>>>> REPLACE
|
||||||
|
[/CSS]
|
||||||
|
RESPONSE
|
||||||
|
|
||||||
|
DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
|
||||||
|
|
||||||
|
expect(artifact.versions.last.css).to eq("body {\n color: red;\n}")
|
||||||
|
end
|
||||||
|
|
||||||
|
it "tracks failed searches" do
|
||||||
|
original_css = ".button { color: blue; }"
|
||||||
|
artifact.update!(css: original_css)
|
||||||
|
|
||||||
|
response = <<~RESPONSE
|
||||||
|
[CSS]
|
||||||
|
<<<<<<< SEARCH
|
||||||
|
.button { color: green; }
|
||||||
|
=======
|
||||||
|
.button { color: red; }
|
||||||
|
>>>>>>> REPLACE
|
||||||
|
[/CSS]
|
||||||
|
RESPONSE
|
||||||
|
|
||||||
|
DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
|
||||||
|
|
||||||
|
expect(strategy.failed_searches).to contain_exactly(
|
||||||
|
{ section: :css, search: ".button { color: green; }" },
|
||||||
|
)
|
||||||
|
expect(artifact.versions.last.css).to eq(original_css)
|
||||||
|
end
|
||||||
|
|
||||||
|
it "handles complete section replacements" do
|
||||||
|
original_html = "<div>old content</div>"
|
||||||
|
artifact.update!(html: original_html)
|
||||||
|
|
||||||
|
response = <<~RESPONSE
|
||||||
|
[HTML]
|
||||||
|
<div>new content</div>
|
||||||
|
[/HTML]
|
||||||
|
RESPONSE
|
||||||
|
|
||||||
|
DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
|
||||||
|
|
||||||
|
expect(artifact.versions.last.html.strip).to eq("<div>new content</div>")
|
||||||
|
end
|
||||||
|
|
||||||
|
it "ignores empty or 'no changes' sections part 1" do
|
||||||
|
original = {
|
||||||
|
html: "<div>content</div>",
|
||||||
|
css: ".button { color: blue; }",
|
||||||
|
js: "console.log('test');",
|
||||||
|
}
|
||||||
|
|
||||||
|
artifact.update!(html: original[:html], css: original[:css], js: original[:js])
|
||||||
|
|
||||||
|
response = <<~RESPONSE
|
||||||
|
[HTML]
|
||||||
|
no changes
|
||||||
|
[/HTML]
|
||||||
|
[CSS]
|
||||||
|
(NO CHANGES)
|
||||||
|
[/CSS]
|
||||||
|
[JavaScript]
|
||||||
|
<<<<<<< SEARCH
|
||||||
|
console.log('test');
|
||||||
|
=======
|
||||||
|
console.log('(no changes)');
|
||||||
|
>>>>>>> REPLACE
|
||||||
|
[/JavaScript]
|
||||||
|
RESPONSE
|
||||||
|
|
||||||
|
DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
|
||||||
|
|
||||||
|
version = artifact.versions.last
|
||||||
|
expect(version.html).to eq(original[:html])
|
||||||
|
expect(version.css).to eq(original[:css])
|
||||||
|
expect(version.js).to eq("console.log('(no changes)');")
|
||||||
|
end
|
||||||
|
|
||||||
|
it "ignores empty or 'no changes' section part 2" do
|
||||||
|
original = {
|
||||||
|
html: "<div>content</div>",
|
||||||
|
css: ".button { color: blue; }",
|
||||||
|
js: "console.log('test');",
|
||||||
|
}
|
||||||
|
|
||||||
|
artifact.update!(html: original[:html], css: original[:css], js: original[:js])
|
||||||
|
|
||||||
|
response = <<~RESPONSE
|
||||||
|
[HTML]
|
||||||
|
(no changes)
|
||||||
|
[/HTML]
|
||||||
|
[CSS]
|
||||||
|
|
||||||
|
[/CSS]
|
||||||
|
[JavaScript]
|
||||||
|
<<<<<<< SEARCH
|
||||||
|
console.log('test');
|
||||||
|
=======
|
||||||
|
console.log('updated');
|
||||||
|
>>>>>>> REPLACE
|
||||||
|
[/JavaScript]
|
||||||
|
RESPONSE
|
||||||
|
|
||||||
|
DiscourseAi::Completions::Llm.with_prepared_responses([response]) { strategy.apply }
|
||||||
|
|
||||||
|
version = artifact.versions.last
|
||||||
|
expect(version.html).to eq(original[:html])
|
||||||
|
expect(version.css).to eq(original[:css])
|
||||||
|
expect(version.js).to eq("console.log('updated');")
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
@ -262,7 +262,7 @@ RSpec.describe DiscourseAi::AiBot::Tools::UpdateArtifact do
|
|||||||
[/CSS]
|
[/CSS]
|
||||||
|
|
||||||
[JavaScript]
|
[JavaScript]
|
||||||
nothing to do
|
no changes
|
||||||
[/JavaScript]
|
[/JavaScript]
|
||||||
|
|
||||||
LLMs like to say nonsense that we can ignore here as well
|
LLMs like to say nonsense that we can ignore here as well
|
||||||
|
@ -171,5 +171,33 @@ RSpec.describe DiscourseAi::Utils::DiffUtils::SimpleDiff do
|
|||||||
|
|
||||||
expect(subject.apply(content, search, replace).strip).to eq(expected.strip)
|
expect(subject.apply(content, search, replace).strip).to eq(expected.strip)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
it "handles missing lines in search" do
|
||||||
|
original = <<~TEXT
|
||||||
|
line1
|
||||||
|
line2
|
||||||
|
line3
|
||||||
|
line4
|
||||||
|
line5
|
||||||
|
line1
|
||||||
|
line2
|
||||||
|
TEXT
|
||||||
|
|
||||||
|
search = <<~TEXT
|
||||||
|
line1
|
||||||
|
...
|
||||||
|
line3
|
||||||
|
...
|
||||||
|
line1
|
||||||
|
TEXT
|
||||||
|
|
||||||
|
replace = ""
|
||||||
|
|
||||||
|
expected = <<~TEXT
|
||||||
|
line2
|
||||||
|
TEXT
|
||||||
|
|
||||||
|
expect(subject.apply(original, search, replace).strip).to eq(expected.strip)
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
Loading…
x
Reference in New Issue
Block a user