discourse-ai/lib/tokenizer/open_ai_tokenizer.rb
Rafael dos Santos Silva 3b8f900486
FIX: Handle unicode on tokenizer (#515)
* FIX: Handle unicode on tokenizer

Our fast-track code broke when strings contained characters that take up more
tokens than UTF-8 characters.

Admins can set `DISCOURSE_AI_STRICT_TOKEN_COUNTING: true` in app.yml to ensure token counting is strict, even if slower.


Co-authored-by: wozulong <sidle.pax_0e@icloud.com>
2024-03-14 17:33:30 -03:00

39 lines
1.1 KiB
Ruby

# frozen_string_literal: true
module DiscourseAi
  module Tokenizer
    # Tokenizer backed by Tiktoken's "cl100k_base" encoding.
    #
    # All helpers are class-level methods. When the
    # `ai_strict_token_counting` site setting is off, `truncate` and
    # `can_expand_tokens?` first try a cheap character-count heuristic and only
    # fall back to real tokenization when the heuristic is inconclusive.
    class OpenAiTokenizer < BasicTokenizer
      class << self
        # Returns the encoder, loading it on first use.
        #
        # Memoized in a class-level instance variable rather than a class
        # variable: class variables are shared across the inheritance tree, so
        # a subclass memoizing a different encoding under the same name would
        # otherwise end up sharing this one.
        #
        # @return [Object] the encoder returned by `Tiktoken.get_encoding`
        def tokenizer
          @tokenizer ||= Tiktoken.get_encoding("cl100k_base")
        end

        # Encodes `text` into tokens.
        #
        # @param text [String] the text to encode
        # @return [Array] the tokens as returned by the encoder's `encode`
        def tokenize(text)
          tokenizer.encode(text)
        end

        # Shortens `text` so that it spans at most `max_length` tokens.
        #
        # @param text [String] the text to shorten
        # @param max_length [Integer] maximum number of tokens to keep
        # @return [String] `text` itself when the fast track applies, otherwise
        #   the decoded prefix; "" when `max_length` is zero or negative
        def truncate(text, max_length)
          # Nothing can fit; also keeps a negative size away from Array#take,
          # which raises ArgumentError for it.
          return "" if max_length <= 0

          # Fast track for the common case. The limit is halved because some
          # unicode characters take more than one token per character.
          return text if !SiteSetting.ai_strict_token_counting && text.size < max_length / 2

          # Tokenize once; a failed decode only needs a shorter token prefix,
          # not a fresh encoding of the whole text.
          tokens = tokenize(text).take(max_length)

          begin
            tokenizer.decode(tokens)
          rescue Tiktoken::UnicodeError
            # The cut presumably landed inside a multi-token character. Drop
            # the trailing token and decode again; the retry is bounded by the
            # number of tokens left.
            raise if tokens.empty?
            tokens.pop
            retry
          end
        end

        # Tells whether `text` followed by `addition` stays strictly below
        # `max_length` tokens. The two strings are counted separately and the
        # counts are summed.
        #
        # @param text [String] the existing text
        # @param addition [String] the text that would be appended
        # @param max_length [Integer] the token budget
        # @return [Boolean] true when the combined count is below `max_length`
        def can_expand_tokens?(text, addition, max_length)
          # Fast track for the common case. The limit is halved because some
          # unicode characters take more than one token per character.
          if !SiteSetting.ai_strict_token_counting && text.size + addition.size < max_length / 2
            return true
          end

          tokenize(text).length + tokenize(addition).length < max_length
        end
      end
    end
  end
end