diff --git a/lib/tokenizer/open_ai_gpt4o_tokenizer.rb b/lib/tokenizer/open_ai_gpt4o_tokenizer.rb
new file mode 100644
index 00000000..bf7a28be
--- /dev/null
+++ b/lib/tokenizer/open_ai_gpt4o_tokenizer.rb
@@ -0,0 +1,13 @@
+# frozen_string_literal: true
+
+module DiscourseAi
+  module Tokenizer
+    class OpenAiGpt4oTokenizer < OpenAiTokenizer
+      class << self
+        def tokenizer
+          @@tokenizer ||= Tiktoken.get_encoding("o200k_base")
+        end
+      end
+    end
+  end
+end
diff --git a/plugin.rb b/plugin.rb
index 1b5ad2f9..74718993 100644
--- a/plugin.rb
+++ b/plugin.rb
@@ -9,7 +9,7 @@
 # required_version: 2.7.0
 
 gem "tokenizers", "0.4.4"
-gem "tiktoken_ruby", "0.0.7"
+gem "tiktoken_ruby", "0.0.9"
 
 enabled_site_setting :discourse_ai_enabled
 
diff --git a/spec/shared/tokenizer_spec.rb b/spec/shared/tokenizer_spec.rb
index e46b6d9c..92751624 100644
--- a/spec/shared/tokenizer_spec.rb
+++ b/spec/shared/tokenizer_spec.rb
@@ -109,6 +109,16 @@ describe DiscourseAi::Tokenizer::OpenAiTokenizer do
   end
 end
 
+describe DiscourseAi::Tokenizer::OpenAiGpt4oTokenizer do
+  describe "#size" do
+    describe "returns a token count" do
+      it "for a sentence with punctuation and capitalization and numbers" do
+        expect(described_class.size("Hello, World! 123")).to eq(6)
+      end
+    end
+  end
+end
+
 describe DiscourseAi::Tokenizer::AllMpnetBaseV2Tokenizer do
   describe "#size" do
     describe "returns a token count" do
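
Note: as a quick sanity check of what this diff enables, the snippet below is a minimal usage sketch (not part of the diff) assuming the plugin and the bumped tiktoken_ruby gem are loaded; the new subclass inherits `.size` from OpenAiTokenizer and only swaps the encoding to o200k_base.

# Counts tokens for a string using the new GPT-4o tokenizer; the expected
# value 6 mirrors the expectation added in spec/shared/tokenizer_spec.rb.
count = DiscourseAi::Tokenizer::OpenAiGpt4oTokenizer.size("Hello, World! 123")
puts count # => 6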