# frozen_string_literal: true
module DiscourseAi
  # Thin wrapper around the Hugging Face `tokenizers` gem, exposing
  # tokenization and token counting against the bert-base-uncased
  # vocabulary bundled with the plugin.
  class Tokenizer
    # Lazily loads and memoizes the tokenizer.
    #
    # Uses a class-level instance variable rather than a class variable
    # (`@@tokenizer`): class variables are shared across the whole
    # inheritance tree and are a well-known Ruby footgun.
    #
    # @return [Tokenizers::Tokenizer] the memoized tokenizer instance
    def self.tokenizer
      @tokenizer ||=
        Tokenizers.from_file("./plugins/discourse-ai/tokenizers/bert-base-uncased.json")
    end

    # @param text [String] raw text to tokenize
    # @return [Array<String>] the BERT wordpiece tokens for +text+
    def self.tokenize(text)
      tokenizer.encode(text).tokens
    end

    # @param text [String] raw text to measure
    # @return [Integer] number of tokens +text+ encodes to
    def self.size(text)
      tokenize(text).size
    end
  end
end
|