FIX: Limit system message size to 60% of available tokens. (#714)

Using RAG fragments can lead to considerably large system messages, which becomes problematic when models have a smaller context window.

Before this change, we only looked at the rest of the conversation to make sure we don't surpass the limit, which could lead to two unwanted scenarios when the system message is large:

All other messages are excluded due to size.
The system message already exceeds the limit.

As a result, I'm putting a hard limit of 60% of available tokens. We don't want to aggressively truncate because if RAG fragments are included, the system message contains a lot of context to improve the model response, but we also want to make room for the recent messages in the conversation.
This commit is contained in:
Roman Rizzi 2024-07-12 15:09:01 -03:00 committed by GitHub
parent 5c1ab85583
commit 0a8195242b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 36 additions and 11 deletions

View File

@ -95,7 +95,17 @@ module DiscourseAi
range = (0..-1)
if messages.dig(0, :type) == :system
max_system_tokens = prompt_limit * 0.6
system_message = messages[0]
system_size = calculate_message_token(system_message)
if system_size > max_system_tokens
system_message[:content] = tokenizer.truncate(
system_message[:content],
max_system_tokens,
)
end
trimmed_messages << system_message
current_token_count += calculate_message_token(system_message)
range = (1..-1)

View File

@ -8,22 +8,20 @@ class TestDialect < DiscourseAi::Completions::Dialects::Dialect
end
def tokenizer
Class.new do
def self.size(str)
str.length
end
end
DiscourseAi::Tokenizer::OpenAiTokenizer
end
end
RSpec.describe DiscourseAi::Completions::Dialects::Dialect do
describe "#trim_messages" do
let(:five_token_msg) { "This represents five tokens." }
it "should trim tool messages if tool_calls are trimmed" do
prompt = DiscourseAi::Completions::Prompt.new("12345")
prompt.push(type: :user, content: "12345")
prompt.push(type: :tool_call, content: "12345", id: 1)
prompt.push(type: :tool, content: "12345", id: 1)
prompt.push(type: :user, content: "12345")
prompt = DiscourseAi::Completions::Prompt.new(five_token_msg)
prompt.push(type: :user, content: five_token_msg)
prompt.push(type: :tool_call, content: five_token_msg, id: 1)
prompt.push(type: :tool, content: five_token_msg, id: 1)
prompt.push(type: :user, content: five_token_msg)
dialect = TestDialect.new(prompt, "test")
dialect.max_prompt_tokens = 15 # fits the user messages and the tool_call message
@ -31,7 +29,24 @@ RSpec.describe DiscourseAi::Completions::Dialects::Dialect do
trimmed = dialect.trim(prompt.messages)
expect(trimmed).to eq(
[{ type: :system, content: "12345" }, { type: :user, content: "12345" }],
[{ type: :system, content: five_token_msg }, { type: :user, content: five_token_msg }],
)
end
it "limits the system message to 60% of available tokens" do
prompt = DiscourseAi::Completions::Prompt.new("I'm a system message consisting of 10 tokens")
prompt.push(type: :user, content: five_token_msg)
dialect = TestDialect.new(prompt, "test")
dialect.max_prompt_tokens = 15
trimmed = dialect.trim(prompt.messages)
expect(trimmed).to eq(
[
{ type: :system, content: "I'm a system message consisting of 10" },
{ type: :user, content: five_token_msg },
],
)
end
end