FEATURE: Update OpenAI tokenizer to GPT-4o and later (#1467)

This commit is contained in:
Rafael dos Santos Silva 2025-06-26 15:26:09 -03:00 committed by GitHub
parent 2fe99a0bec
commit a40e2d3156
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 7 additions and 29 deletions

View File

@ -1,13 +0,0 @@
# frozen_string_literal: true

module DiscourseAi
  module Tokenizer
    # Tokenizer for the GPT-4o model family, which uses the "o200k_base"
    # encoding (earlier GPT-3.5/GPT-4 models use "cl100k_base").
    class OpenAiGpt4oTokenizer < OpenAiTokenizer
      class << self
        # Returns the memoized Tiktoken encoder for this class.
        #
        # NOTE: memoized in a class *instance* variable rather than a class
        # variable (@@tokenizer). Class variables are shared across the
        # inheritance tree, so a @@-cached encoder here would collide with the
        # one cached by the parent OpenAiTokenizer — whichever class resolved
        # first would poison the cache and the wrong encoding could be
        # returned for the other.
        def tokenizer
          @tokenizer ||= Tiktoken.get_encoding("o200k_base")
        end
      end
    end
  end
end

View File

@ -5,7 +5,7 @@ module DiscourseAi
class OpenAiTokenizer < BasicTokenizer
class << self
def tokenizer
@@tokenizer ||= Tiktoken.get_encoding("cl100k_base")
@@tokenizer ||= Tiktoken.get_encoding("o200k_base")
end
def tokenize(text)

View File

@ -99,7 +99,8 @@ RSpec.describe DiscourseAi::Completions::Dialects::Dialect do
end
it "limits the system message to 60% of available tokens" do
prompt = DiscourseAi::Completions::Prompt.new("I'm a system message consisting of 10 tokens")
prompt =
DiscourseAi::Completions::Prompt.new("I'm a system message consisting of 10 tokens okay")
prompt.push(type: :user, content: five_token_msg)
dialect = TestDialect.new(prompt, llm_model)
@ -109,7 +110,7 @@ RSpec.describe DiscourseAi::Completions::Dialects::Dialect do
expect(trimmed).to eq(
[
{ type: :system, content: "I'm a system message consisting of 10" },
{ type: :system, content: "I'm a system message consisting of 10 tokens" },
{ type: :user, content: five_token_msg },
],
)

View File

@ -18,7 +18,7 @@ class OpenAiMock < EndpointMock
model: "gpt-3.5-turbo-0301",
usage: {
prompt_tokens: 8,
completion_tokens: 13,
completion_tokens: 12,
total_tokens: 499,
},
choices: [

View File

@ -79,7 +79,7 @@ describe DiscourseAi::Tokenizer::OpenAiTokenizer do
it "truncates a sentence successfully at a multibyte unicode character" do
sentence = "foo bar 👨🏿‍👩🏿‍👧🏿‍👧🏿 baz qux quux corge grault garply waldo fred plugh xyzzy thud"
expect(described_class.truncate(sentence, 7)).to eq("foo bar 👨🏿")
expect(described_class.truncate(sentence, 7)).to eq("foo bar 👨🏿")
end
it "truncates unicode characters properly when they use more than one token per char" do
@ -104,17 +104,7 @@ describe DiscourseAi::Tokenizer::OpenAiTokenizer do
end
it "handles unicode characters properly when they use more than one token per char" do
expect(described_class.below_limit?("我喜欢吃比萨萨", 10)).to eq(false)
end
end
end
describe DiscourseAi::Tokenizer::OpenAiGpt4oTokenizer do
describe "#size" do
describe "returns a token count" do
it "for a sentence with punctuation and capitalization and numbers" do
expect(described_class.size("Hello, World! 123")).to eq(6)
end
expect(described_class.below_limit?("我喜欢吃比萨萨", 6)).to eq(false)
end
end
end