From 3502f0f1cd6a89c3c83264cfcee3f17d1d196e34 Mon Sep 17 00:00:00 2001
From: Rafael dos Santos Silva
Date: Mon, 22 Jul 2024 15:26:14 -0300
Subject: [PATCH] FEATURE: GPT4o Tokenizer (#721)

---
 lib/tokenizer/open_ai_gpt4o_tokenizer.rb | 18 ++++++++++++++++++
 plugin.rb                                |  2 +-
 spec/shared/tokenizer_spec.rb            | 10 ++++++++++
 3 files changed, 29 insertions(+), 1 deletion(-)
 create mode 100644 lib/tokenizer/open_ai_gpt4o_tokenizer.rb

diff --git a/lib/tokenizer/open_ai_gpt4o_tokenizer.rb b/lib/tokenizer/open_ai_gpt4o_tokenizer.rb
new file mode 100644
index 00000000..bf7a28be
--- /dev/null
+++ b/lib/tokenizer/open_ai_gpt4o_tokenizer.rb
@@ -0,0 +1,18 @@
+# frozen_string_literal: true
+
+module DiscourseAi
+  module Tokenizer
+    # OpenAiTokenizer variant whose encoder is Tiktoken's "o200k_base"
+    # encoding.
+    class OpenAiGpt4oTokenizer < OpenAiTokenizer
+      class << self
+        # Returns the memoized "o200k_base" encoder. It is cached in a
+        # class-level instance variable rather than @@tokenizer: class
+        # variables are shared along the inheritance chain, so an encoder
+        # cached under @@tokenizer by an ancestor would be returned here.
+        def tokenizer
+          @tokenizer ||= Tiktoken.get_encoding("o200k_base")
+        end
+      end
+    end
+  end
+end
diff --git a/plugin.rb b/plugin.rb
index 1b5ad2f9..74718993 100644
--- a/plugin.rb
+++ b/plugin.rb
@@ -9,7 +9,7 @@
 # required_version: 2.7.0
 
 gem "tokenizers", "0.4.4"
-gem "tiktoken_ruby", "0.0.7"
+gem "tiktoken_ruby", "0.0.9"
 
 enabled_site_setting :discourse_ai_enabled
 
diff --git a/spec/shared/tokenizer_spec.rb b/spec/shared/tokenizer_spec.rb
index e46b6d9c..92751624 100644
--- a/spec/shared/tokenizer_spec.rb
+++ b/spec/shared/tokenizer_spec.rb
@@ -109,6 +109,16 @@ describe DiscourseAi::Tokenizer::OpenAiTokenizer do
   end
 end
 
+describe DiscourseAi::Tokenizer::OpenAiGpt4oTokenizer do
+  describe "#size" do
+    describe "returns a token count" do
+      it "for a sentence with punctuation and capitalization and numbers" do
+        expect(described_class.size("Hello, World! 123")).to eq(6)
+      end
+    end
+  end
+end
+
 describe DiscourseAi::Tokenizer::AllMpnetBaseV2Tokenizer do
   describe "#size" do
     describe "returns a token count" do