discourse-ai/lib/tokenizer/open_ai_tokenizer.rb

# frozen_string_literal: true

module DiscourseAi
  module Tokenizer
    class OpenAiTokenizer < BasicTokenizer
      class << self
        def tokenizer
          @@tokenizer ||= Tiktoken.get_encoding("cl100k_base")
        end

        def tokenize(text)
          tokenizer.encode(text)
        end

        def encode(text)
          tokenizer.encode(text)
        end

        def decode(token_ids)
          tokenizer.decode(token_ids)
        end

        def truncate(text, max_length)
          # fast track common case, /2 to handle unicode chars
          # than can take more than 1 token per char
          return text if !SiteSetting.ai_strict_token_counting && text.size < max_length / 2

          tokenizer.decode(tokenize(text).take(max_length))
        rescue Tiktoken::UnicodeError
          max_length = max_length - 1
          retry
        end

        def can_expand_tokens?(text, addition, max_length)
          # fast track common case, /2 to handle unicode chars
          # than can take more than 1 token per char
          if !SiteSetting.ai_strict_token_counting && text.size + addition.size < max_length / 2
            return true
          end

          tokenizer.encode(text).length + tokenizer.encode(addition).length < max_length
        end
      end
    end
  end
end