FEATURE: GPT4o Tokenizer (#721)

This commit is contained in:
Rafael dos Santos Silva 2024-07-22 15:26:14 -03:00 committed by GitHub
parent 7f2c3a1ab9
commit 3502f0f1cd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 24 additions and 1 deletion

View File

@@ -0,0 +1,13 @@
# frozen_string_literal: true

module DiscourseAi
  module Tokenizer
    # Tokenizer for OpenAI's GPT-4o family of models, which use the
    # "o200k_base" encoding (distinct from the encoding used by the
    # parent OpenAiTokenizer).
    class OpenAiGpt4oTokenizer < OpenAiTokenizer
      class << self
        # Lazily builds and memoizes the Tiktoken encoder for o200k_base.
        #
        # Uses a class *instance* variable rather than a class variable:
        # @@tokenizer is shared across the whole OpenAiTokenizer inheritance
        # tree, so whichever class memoized first would leak its encoding
        # into every sibling/parent tokenizer. @tokenizer is scoped to this
        # class alone.
        #
        # @return [Tiktoken::Encoding] the o200k_base encoder
        def tokenizer
          @tokenizer ||= Tiktoken.get_encoding("o200k_base")
        end
      end
    end
  end
end

View File

@@ -9,7 +9,7 @@
  # required_version: 2.7.0
  gem "tokenizers", "0.4.4"
- gem "tiktoken_ruby", "0.0.7"
+ gem "tiktoken_ruby", "0.0.9"
  enabled_site_setting :discourse_ai_enabled

View File

@@ -109,6 +109,16 @@ describe DiscourseAi::Tokenizer::OpenAiTokenizer do
    end
  end
# Spec coverage for the GPT-4o tokenizer introduced by this commit.
describe DiscourseAi::Tokenizer::OpenAiGpt4oTokenizer do
describe "#size" do
describe "returns a token count" do
it "for a sentence with punctuation and capitalization and numbers" do
# NOTE(review): 6 is the expected o200k_base token count for this exact
# string — the value is vocabulary-specific; confirm against
# tiktoken_ruby 0.0.9, the first version shipping the o200k_base encoding.
expect(described_class.size("Hello, World! 123")).to eq(6)
end
end
end
end
describe DiscourseAi::Tokenizer::AllMpnetBaseV2Tokenizer do
  describe "#size" do
    describe "returns a token count" do