FEATURE: Update OpenAI tokenizer to GPT-4o and later (#1467)

This commit is contained in:
Rafael dos Santos Silva 2025-06-26 15:26:09 -03:00 committed by GitHub
parent 2fe99a0bec
commit a40e2d3156
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 7 additions and 29 deletions

View File

@ -1,13 +0,0 @@
# frozen_string_literal: true

module DiscourseAi
  module Tokenizer
    # Tokenizer for the GPT-4o model family, which uses the "o200k_base"
    # encoding (earlier GPT-3.5/GPT-4 models use "cl100k_base").
    class OpenAiGpt4oTokenizer < OpenAiTokenizer
      class << self
        # Returns the memoized Tiktoken encoder for this class.
        #
        # NOTE: memoized in a class *instance* variable rather than a class
        # variable (@@tokenizer). Class variables are shared across the
        # inheritance tree, so a @@-cached encoder here would collide with the
        # one cached by the parent OpenAiTokenizer — whichever class resolved
        # first would poison the cache and the wrong encoding could be
        # returned for the other.
        def tokenizer
          @tokenizer ||= Tiktoken.get_encoding("o200k_base")
        end
      end
    end
  end
end

View File

@ -5,7 +5,7 @@ module DiscourseAi
class OpenAiTokenizer < BasicTokenizer
class << self
def tokenizer
@@tokenizer ||= Tiktoken.get_encoding("cl100k_base")
@@tokenizer ||= Tiktoken.get_encoding("o200k_base")
end
def tokenize(text)

View File

@ -99,7 +99,8 @@ RSpec.describe DiscourseAi::Completions::Dialects::Dialect do
end
it "limits the system message to 60% of available tokens" do
prompt = DiscourseAi::Completions::Prompt.new("I'm a system message consisting of 10 tokens")
prompt =
DiscourseAi::Completions::Prompt.new("I'm a system message consisting of 10 tokens okay")
prompt.push(type: :user, content: five_token_msg)
dialect = TestDialect.new(prompt, llm_model)
@ -109,7 +110,7 @@ RSpec.describe DiscourseAi::Completions::Dialects::Dialect do
expect(trimmed).to eq(
[
{ type: :system, content: "I'm a system message consisting of 10" },
{ type: :system, content: "I'm a system message consisting of 10 tokens" },
{ type: :user, content: five_token_msg },
],
)

View File

@ -18,7 +18,7 @@ class OpenAiMock < EndpointMock
model: "gpt-3.5-turbo-0301",
usage: {
prompt_tokens: 8,
completion_tokens: 13,
completion_tokens: 12,
total_tokens: 499,
},
choices: [

View File

@ -79,7 +79,7 @@ describe DiscourseAi::Tokenizer::OpenAiTokenizer do
it "truncates a sentence successfully at a multibyte unicode character" do
sentence = "foo bar 👨🏿‍👩🏿‍👧🏿‍👧🏿 baz qux quux corge grault garply waldo fred plugh xyzzy thud"
expect(described_class.truncate(sentence, 7)).to eq("foo bar 👨🏿")
expect(described_class.truncate(sentence, 7)).to eq("foo bar 👨🏿")
end
it "truncates unicode characters properly when they use more than one token per char" do
@ -104,17 +104,7 @@ describe DiscourseAi::Tokenizer::OpenAiTokenizer do
end
it "handles unicode characters properly when they use more than one token per char" do
expect(described_class.below_limit?("我喜欢吃比萨萨", 10)).to eq(false)
end
end
end
describe DiscourseAi::Tokenizer::OpenAiGpt4oTokenizer do
describe "#size" do
describe "returns a token count" do
it "for a sentence with punctuation and capitalization and numbers" do
expect(described_class.size("Hello, World! 123")).to eq(6)
end
expect(described_class.below_limit?("我喜欢吃比萨萨", 6)).to eq(false)
end
end
end