diff --git a/lib/shared/tokenizer/tokenizer.rb b/lib/shared/tokenizer/tokenizer.rb
index c7afeb0a..fc66c4e7 100644
--- a/lib/shared/tokenizer/tokenizer.rb
+++ b/lib/shared/tokenizer/tokenizer.rb
@@ -14,6 +14,9 @@ module DiscourseAi
       tokenize(text).size
     end
     def self.truncate(text, max_length)
+      # Fast track the common case where the text is already short enough.
+      return text if text.size < max_length
+
       tokenizer.decode(tokenizer.encode(text).ids.take(max_length))
     end
   end
@@ -42,6 +45,9 @@ module DiscourseAi
     end

     def self.truncate(text, max_length)
+      # Fast track the common case where the text is already short enough.
+      return text if text.size < max_length
+
       tokenizer.decode(tokenize(text).take(max_length))
     end
   end
diff --git a/lib/tasks/modules/embeddings/database.rake b/lib/tasks/modules/embeddings/database.rake
index a0ea1935..fccb1592 100644
--- a/lib/tasks/modules/embeddings/database.rake
+++ b/lib/tasks/modules/embeddings/database.rake
@@ -17,7 +17,7 @@ task "ai:embeddings:create_table" => [:environment] do
 end

 desc "Backfill embeddings for all topics"
-task "ai:embeddings:backfill", [:start_topic] => [:environment] do
+task "ai:embeddings:backfill", [:start_topic] => [:environment] do |_, args|
   public_categories = Category.where(read_restricted: false).pluck(:id)
   topic_embeddings = DiscourseAi::Embeddings::Topic.new
   Topic