discourse-ai/lib/shared/tokenizer/tokenizer.rb

# frozen_string_literal: true

module DiscourseAi
  module Tokenizer
    # Template base class: subclasses only need to supply `self.tokenizer`
    # and inherit tokenize/size/truncate for free.
    class BasicTokenizer
      def self.tokenizer
        raise NotImplementedError
      end

      def self.tokenize(text)
        tokenizer.encode(text).tokens
      end

      def self.size(text)
        tokenize(text).size
      end

      def self.truncate(text, max_length)
        # Fast track the common case where the text is already short enough.
        return text if text.size < max_length

        tokenizer.decode(tokenizer.encode(text).ids.take(max_length))
      end
    end
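
    # Adding a new model only requires supplying `self.tokenizer`; the
    # encode/size/truncate plumbing is inherited from BasicTokenizer.
    # A hypothetical sketch (class name and vocabulary path are illustrative,
    # not part of this plugin):
    #
    #   class LlamaTokenizer < BasicTokenizer
    #     def self.tokenizer
    #       @@tokenizer ||=
    #         Tokenizers.from_file("./plugins/discourse-ai/tokenizers/llama.json")
    #     end
    #   end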

    class BertTokenizer < BasicTokenizer
      def self.tokenizer
        @@tokenizer ||=
          Tokenizers.from_file("./plugins/discourse-ai/tokenizers/bert-base-uncased.json")
      end
    end

    class AnthropicTokenizer < BasicTokenizer
      def self.tokenizer
        @@tokenizer ||=
          Tokenizers.from_file("./plugins/discourse-ai/tokenizers/claude-v1-tokenization.json")
      end
    end
    class OpenAiTokenizer < BasicTokenizer
      def self.tokenizer
        @@tokenizer ||= Tiktoken.get_encoding("cl100k_base")
      end

      # Tiktoken's encode returns token ids directly (there are no string
      # tokens to materialize), so tokenize and truncate work with ids.
      def self.tokenize(text)
        tokenizer.encode(text)
      end

      def self.truncate(text, max_length)
        # Fast track the common case where the text is already short enough.
        return text if text.size < max_length

        tokenizer.decode(tokenize(text).take(max_length))
      end
    end
  end
end
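
# Usage sketch (illustrative; assumes the Tokenizers and Tiktoken gems are
# loaded and the bundled JSON vocabularies exist at the paths above):
#
#   DiscourseAi::Tokenizer::BertTokenizer.size("discourse is awesome")
#   DiscourseAi::Tokenizer::OpenAiTokenizer.truncate(post_text, 2000)
#
# `post_text` is a placeholder for any string being fit into a model's
# context window; note that truncate's limit is measured in tokens, not
# characters.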