FIX: throwing away first file in diff (#573)

We were chucking out the first file in a PR diff due to a
logic bug
This commit is contained in:
Sam 2024-04-11 13:26:58 +10:00 committed by GitHub
parent 0cbbf130b9
commit 3b0cfdbe5c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 48 additions and 14 deletions

View File

@ -63,7 +63,7 @@ module DiscourseAi
if response_code == "200" if response_code == "200"
diff = body diff = body
diff = sort_and_shorten_diff(diff) diff = self.class.sort_and_shorten_diff(diff)
diff = truncate(diff, max_length: 20_000, percent_length: 0.3, llm: llm) diff = truncate(diff, max_length: 20_000, percent_length: 0.3, llm: llm)
{ diff: diff } { diff: diff }
else else
@ -75,14 +75,12 @@ module DiscourseAi
{ repo: repo, pull_id: pull_id, url: url } { repo: repo, pull_id: pull_id, url: url }
end end
private def self.sort_and_shorten_diff(diff, threshold: LARGE_OBJECT_THRESHOLD)
def sort_and_shorten_diff(diff, threshold: LARGE_OBJECT_THRESHOLD)
# This regex matches the start of a new file in the diff, # This regex matches the start of a new file in the diff,
# capturing the file paths for later use. # capturing the file paths for later use.
file_start_regex = /^diff --git.*/ file_start_regex = /^diff --git.*/
prev_start = 0 prev_start = -1
prev_match = nil prev_match = nil
split = [] split = []
@ -90,9 +88,8 @@ module DiscourseAi
diff.scan(file_start_regex) do |match| diff.scan(file_start_regex) do |match|
match_start = $~.offset(0)[0] # Get the start position of this match match_start = $~.offset(0)[0] # Get the start position of this match
if prev_start != 0 if prev_start != -1
full_diff = diff[prev_start...match_start] full_diff = diff[prev_start...match_start]
split << [prev_match, full_diff] split << [prev_match, full_diff]
end end

View File

@ -33,7 +33,7 @@ module DiscourseAi
elsif msg[:type] == :model elsif msg[:type] == :model
memo << "\n#{msg[:content]}" memo << "\n#{msg[:content]}"
elsif msg[:type] == :tool elsif msg[:type] == :tool
tool = JSON.parse(msg[:content], symbolize_names: true) JSON.parse(msg[:content], symbolize_names: true)
memo << "\n[INST]\n" memo << "\n[INST]\n"
memo << (<<~TEXT).strip memo << (<<~TEXT).strip

View File

@ -12,7 +12,7 @@ RSpec.describe DiscourseAi::Completions::Dialects::Llama2Classic do
[INST] [INST]
<<SYS>> <<SYS>>
#{context.system_insts} #{context.system_insts}
#{described_class.tool_preamble} #{described_class.tool_preamble(include_array_tip: false)}
<tools> <tools>
#{context.dialect_tools}</tools> #{context.dialect_tools}</tools>
<</SYS>> <</SYS>>
@ -30,7 +30,7 @@ RSpec.describe DiscourseAi::Completions::Dialects::Llama2Classic do
[INST] [INST]
<<SYS>> <<SYS>>
#{context.system_insts} #{context.system_insts}
#{described_class.tool_preamble} #{described_class.tool_preamble(include_array_tip: false)}
<tools> <tools>
#{context.dialect_tools}</tools> #{context.dialect_tools}</tools>
<</SYS>> <</SYS>>

View File

@ -11,7 +11,7 @@ RSpec.describe DiscourseAi::Completions::Dialects::Mixtral do
llama2_classic_version = <<~TEXT llama2_classic_version = <<~TEXT
<s> [INST] <s> [INST]
#{context.system_insts} #{context.system_insts}
#{described_class.tool_preamble} #{described_class.tool_preamble(include_array_tip: false)}
<tools> <tools>
#{context.dialect_tools}</tools> #{context.dialect_tools}</tools>
[/INST] Ok </s> [/INST] Ok </s>
@ -27,7 +27,7 @@ RSpec.describe DiscourseAi::Completions::Dialects::Mixtral do
expected = +(<<~TEXT).strip expected = +(<<~TEXT).strip
<s> [INST] <s> [INST]
#{context.system_insts} #{context.system_insts}
#{described_class.tool_preamble} #{described_class.tool_preamble(include_array_tip: false)}
<tools> <tools>
#{context.dialect_tools}</tools> #{context.dialect_tools}</tools>
[/INST] Ok </s> [/INST] Ok </s>

View File

@ -11,7 +11,7 @@ RSpec.describe DiscourseAi::Completions::Dialects::OrcaStyle do
llama2_classic_version = <<~TEXT llama2_classic_version = <<~TEXT
### System: ### System:
#{context.system_insts} #{context.system_insts}
#{described_class.tool_preamble} #{described_class.tool_preamble(include_array_tip: false)}
<tools> <tools>
#{context.dialect_tools}</tools> #{context.dialect_tools}</tools>
### User: ### User:
@ -28,7 +28,7 @@ RSpec.describe DiscourseAi::Completions::Dialects::OrcaStyle do
expected = +(<<~TEXT) expected = +(<<~TEXT)
### System: ### System:
#{context.system_insts} #{context.system_insts}
#{described_class.tool_preamble} #{described_class.tool_preamble(include_array_tip: false)}
<tools> <tools>
#{context.dialect_tools}</tools> #{context.dialect_tools}</tools>
### User: ### User:

View File

@ -7,6 +7,43 @@ RSpec.describe DiscourseAi::AiBot::Tools::GithubPullRequestDiff do
let(:bot_user) { Fabricate(:user) } let(:bot_user) { Fabricate(:user) }
let(:llm) { DiscourseAi::Completions::Llm.proxy("open_ai:gpt-4") } let(:llm) { DiscourseAi::Completions::Llm.proxy("open_ai:gpt-4") }
context "with #sort_and_shorten_diff" do
it "sorts and shortens the diff without dropping data" do
diff = <<~DIFF
diff --git a/src/lib.rs b/src/lib.rs
index b466edd..66b068f 100644
--- a/src/lib.rs
+++ b/src/lib.rs
this is a longer diff
this is a longer diff
this is a longer diff
diff --git a/tests/test_encoding.py b/tests/test_encoding.py
index 27b2192..9f31319 100644
--- a/tests/test_encoding.py
+++ b/tests/test_encoding.py
short diff
DIFF
sorted_diff = described_class.sort_and_shorten_diff(diff)
expect(sorted_diff).to eq(<<~DIFF)
diff --git a/tests/test_encoding.py b/tests/test_encoding.py
index 27b2192..9f31319 100644
--- a/tests/test_encoding.py
+++ b/tests/test_encoding.py
short diff
diff --git a/src/lib.rs b/src/lib.rs
index b466edd..66b068f 100644
--- a/src/lib.rs
+++ b/src/lib.rs
this is a longer diff
this is a longer diff
this is a longer diff
DIFF
end
end
context "with a valid pull request" do context "with a valid pull request" do
let(:repo) { "discourse/discourse-automation" } let(:repo) { "discourse/discourse-automation" }
let(:pull_id) { 253 } let(:pull_id) { 253 }