2023-11-28 23:17:46 -05:00
|
|
|
# frozen_string_literal: true
|
|
|
|
|
|
|
|
module DiscourseAi
  module Tokenizer
    # Abstract base for tokenizer wrappers. Everything is exposed as class-level
    # methods; concrete subclasses are expected to override `.tokenizer`.
    #
    # As used in this class, the object returned by `.tokenizer` must respond to
    # `encode(text)` (whose result responds to `#tokens` and `#ids`) and to
    # `decode(ids)`.
    class BasicTokenizer
      class << self
        # @return [Array<Class>] the tokenizer classes listed as available for LLMs
        def available_llm_tokenizers
          [
            DiscourseAi::Tokenizer::AnthropicTokenizer,
            DiscourseAi::Tokenizer::Llama3Tokenizer,
            DiscourseAi::Tokenizer::MixtralTokenizer,
            DiscourseAi::Tokenizer::OpenAiTokenizer,
          ]
        end

        # Underlying tokenizer backend. Must be overridden by subclasses.
        #
        # @raise [NotImplementedError] always, on the base class
        def tokenizer
          raise NotImplementedError, "#{self} must implement .tokenizer"
        end

        # @param text [String] text to tokenize
        # @return the `tokens` of the backend's encoding of +text+
        def tokenize(text)
          tokenizer.encode(text).tokens
        end

        # @param text [String] text to measure
        # @return [Integer] number of tokens +text+ encodes to
        def size(text)
          tokenize(text).size
        end

        # @param token_ids token ids, as produced by {.encode}
        # @return the backend's decoding of +token_ids+
        def decode(token_ids)
          tokenizer.decode(token_ids)
        end

        # NOTE: despite its name, +tokens+ is handed straight to the backend's
        # `encode`, exactly like the text passed to {.tokenize}.
        #
        # @param tokens [String] text to encode
        # @return the `ids` of the backend's encoding
        def encode(tokens)
          tokenizer.encode(tokens).ids
        end

        # Cuts +text+ down to at most +max_length+ tokens.
        #
        # @param text [String] text to truncate
        # @param max_length [Integer] token budget
        # @return [String] +text+ itself when the fast path applies, otherwise the
        #   decoded first +max_length+ token ids; "" for a non-positive budget
        def truncate(text, max_length)
          # A non-positive budget leaves room for nothing. Without this guard a
          # negative value reaches Array#take, which raises ArgumentError.
          return "" if max_length <= 0

          # Fast path for the common case: skip tokenizing when the character
          # count is below half the budget. The /2 leaves headroom for unicode
          # chars that can take more than 1 token per char.
          return text if !SiteSetting.ai_strict_token_counting && text.size < max_length / 2

          tokenizer.decode(tokenizer.encode(text).ids.take(max_length))
        end

        # Tells whether +addition+ can be appended to +text+ while staying
        # strictly below +max_length+ tokens.
        #
        # @param text [String] existing text
        # @param addition [String] text that would be appended
        # @param max_length [Integer] token budget
        # @return [Boolean]
        def can_expand_tokens?(text, addition, max_length)
          # Fast path for the common case: skip tokenizing when the combined
          # character count is below half the budget. The /2 leaves headroom for
          # unicode chars that can take more than 1 token per char.
          if !SiteSetting.ai_strict_token_counting && text.size + addition.size < max_length / 2
            return true
          end

          # Both pieces are encoded separately and their id counts summed.
          tokenizer.encode(text).ids.length + tokenizer.encode(addition).ids.length < max_length
        end
      end
    end
  end
end
|