From 739b314312d4372eeb325884268bd41078b4a95a Mon Sep 17 00:00:00 2001
From: Rafael dos Santos Silva
Date: Wed, 17 May 2023 20:21:28 -0300
Subject: [PATCH] Fixes for embeddings and truncate (#67)

---
 lib/shared/tokenizer/tokenizer.rb          | 6 ++++++
 lib/tasks/modules/embeddings/database.rake | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/lib/shared/tokenizer/tokenizer.rb b/lib/shared/tokenizer/tokenizer.rb
index c7afeb0a..fc66c4e7 100644
--- a/lib/shared/tokenizer/tokenizer.rb
+++ b/lib/shared/tokenizer/tokenizer.rb
@@ -14,6 +14,9 @@ module DiscourseAi
         tokenize(text).size
       end
       def self.truncate(text, max_length)
+        # Fast track the common case where the text is already short enough.
+        return text if text.size < max_length
+
         tokenizer.decode(tokenizer.encode(text).ids.take(max_length))
       end
     end
@@ -42,6 +45,9 @@ module DiscourseAi
       end
 
       def self.truncate(text, max_length)
+        # Fast track the common case where the text is already short enough.
+        return text if text.size < max_length
+
         tokenizer.decode(tokenize(text).take(max_length))
       end
     end
diff --git a/lib/tasks/modules/embeddings/database.rake b/lib/tasks/modules/embeddings/database.rake
index a0ea1935..fccb1592 100644
--- a/lib/tasks/modules/embeddings/database.rake
+++ b/lib/tasks/modules/embeddings/database.rake
@@ -17,7 +17,7 @@ task "ai:embeddings:create_table" => [:environment] do
 end
 
 desc "Backfill embeddings for all topics"
-task "ai:embeddings:backfill", [:start_topic] => [:environment] do
+task "ai:embeddings:backfill", [:start_topic] => [:environment] do |_, args|
   public_categories = Category.where(read_restricted: false).pluck(:id)
   topic_embeddings = DiscourseAi::Embeddings::Topic.new
   Topic