discourse-ai/lib/tokenizer/open_ai_tokenizer.rb

# frozen_string_literal: true

module DiscourseAi
  module Tokenizer
    class OpenAiTokenizer < BasicTokenizer
      class << self
        def tokenizer
          # Build the cl100k_base encoder (the encoding used by GPT-3.5 and
          # GPT-4) once and memoize it for the life of the process.
          @@tokenizer ||= Tiktoken.get_encoding("cl100k_base")
        end

        def tokenize(text)
          tokenizer.encode(text)
        end

        def truncate(text, max_length)
          # Fast track the common case where the text is clearly short enough.
          # Compare against max_length / 2 rather than max_length to leave
          # headroom for characters that encode to more than one token each
          # (emoji, rare Unicode); character count alone does not bound tokens.
          return text if text.size < max_length / 2

          tokenizer.decode(tokenize(text).take(max_length))
        rescue Tiktoken::UnicodeError
          # Cutting the token stream mid-character yields invalid UTF-8 on
          # decode; drop one token and retry until the cut lands on a
          # character boundary.
          max_length -= 1
          retry
        end

        def can_expand_tokens?(text, addition, max_length)
          # Same character-count fast path as truncate, with the same
          # headroom; pay for the two full encodes only when needed.
          return true if text.size + addition.size < max_length / 2

          tokenizer.encode(text).length + tokenizer.encode(addition).length < max_length
        end
      end
    end
  end
end
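
# A minimal usage sketch, assuming the tiktoken_ruby gem is installed and
# BasicTokenizer has been loaded (strings and counts are illustrative):
#
#   tok = DiscourseAi::Tokenizer::OpenAiTokenizer
#   tok.tokenize("hello world").length        # => 2
#   tok.truncate("a very long forum post", 3) # => text cut to 3 tokens
#   tok.can_expand_tokens?("prompt", " reply", 4096)
#   # => true while prompt plus reply stay under 4096 tokens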