Fixes for embeddings and truncate (#67)

This commit is contained in:
Rafael dos Santos Silva 2023-05-17 20:21:28 -03:00 committed by GitHub
parent 9ae8f86850
commit 739b314312
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 7 additions and 1 deletion

View File

@@ -14,6 +14,9 @@ module DiscourseAi
     tokenize(text).size
   end
   def self.truncate(text, max_length)
+    # Fast track the common case where the text is already short enough.
+    return text if text.size < max_length
+
     tokenizer.decode(tokenizer.encode(text).ids.take(max_length))
   end
 end
@@ -42,6 +45,9 @@ module DiscourseAi
   end
 
   def self.truncate(text, max_length)
+    # Fast track the common case where the text is already short enough.
+    return text if text.size < max_length
+
     tokenizer.decode(tokenize(text).take(max_length))
   end
 end

View File

@@ -17,7 +17,7 @@ task "ai:embeddings:create_table" => [:environment] do
 end
 
 desc "Backfill embeddings for all topics"
-task "ai:embeddings:backfill", [:start_topic] => [:environment] do
+task "ai:embeddings:backfill", [:start_topic] => [:environment] do |_, args|
   public_categories = Category.where(read_restricted: false).pluck(:id)
   topic_embeddings = DiscourseAi::Embeddings::Topic.new
   Topic