2023-10-26 11:07:37 -04:00
|
|
|
# frozen_string_literal: true
|
|
|
|
|
|
|
|
module Jobs
  # Scheduled job that incrementally generates embeddings for topics and,
  # when enabled, for individual posts.
  #
  # Each run processes at most `SiteSetting.ai_embeddings_backfill_batch_size`
  # records (hard-capped at 50,000) and works through the following passes in
  # order, stopping as soon as the budget is used up:
  #
  #   1. topics that have no embeddings row yet
  #   2. topics whose embeddings were produced by an older model/strategy version
  #   3. topics whose embeddings are older than the topic itself (10% of the
  #      remaining budget)
  #   4. posts that have no embeddings row yet
  #   5. posts whose embeddings were produced by an older model/strategy version
  #   6. a random sample of posts whose embeddings are more than 7 days old
  #      (10% of the remaining budget)
  #
  # Passes 4-6 only run when `SiteSetting.ai_embeddings_per_post_enabled` is set.
  class EmbeddingsBackfill < ::Jobs::Scheduled
    every 5.minutes
    sidekiq_options queue: "low"
    cluster_concurrency 1

    # Runs one backfill cycle.
    #
    # @param args [Hash] job arguments (unused)
    # @return [Integer, nil] number of records processed when every pass ran,
    #   nil when the job exited early (feature disabled or budget exhausted)
    def execute(args)
      return unless SiteSetting.ai_embeddings_enabled

      limit = SiteSetting.ai_embeddings_backfill_batch_size

      if limit > 50_000
        limit = 50_000
        Rails.logger.warn(
          "Limiting backfill batch size to 50,000 to avoid OOM errors, reduce ai_embeddings_backfill_batch_size to avoid this warning",
        )
      end

      rebaked = 0

      vector_rep = DiscourseAi::Embeddings::VectorRepresentations::Base.current_representation
      table_name = DiscourseAi::Embeddings::Schema::TOPICS_TABLE

      topics =
        Topic
          .joins("LEFT JOIN #{table_name} ON #{table_name}.topic_id = topics.id")
          .where(archetype: Archetype.default)
          .where(deleted_at: nil)
          .order("topics.bumped_at DESC")

      # First, we'll try to backfill embeddings for topics that have none.
      # With force left at its default, populate_topic_embeddings restricts
      # the relation to topics without a joined embeddings row.
      rebaked += populate_topic_embeddings(vector_rep, topics.limit(limit - rebaked))

      return if rebaked >= limit

      # Then, we'll try to backfill embeddings for topics that have outdated
      # embeddings, be it model or strategy version
      relation = topics.where(<<~SQL).limit(limit - rebaked)
        #{table_name}.model_version < #{vector_rep.version}
        OR
        #{table_name}.strategy_version < #{vector_rep.strategy_version}
      SQL

      # These topics already have an embeddings row, so force: true is
      # required: without it the helper adds a "topic_id IS NULL" filter, and
      # a row with no joined embeddings has NULL versions that never satisfy
      # the comparison above, so nothing would ever be selected.
      rebaked += populate_topic_embeddings(vector_rep, relation, force: true)

      return if rebaked >= limit

      # Finally, we'll try to backfill embeddings for topics that have outdated
      # embeddings due to edits or new replies. Here we only do 10% of the limit
      relation =
        topics
          .where("#{table_name}.updated_at < ?", 6.hours.ago)
          .where("#{table_name}.updated_at < topics.updated_at")
          .limit((limit - rebaked) / 10)

      # Count these too, so the posts phase below cannot exceed the budget.
      rebaked += populate_topic_embeddings(vector_rep, relation, force: true)

      return if rebaked >= limit

      return unless SiteSetting.ai_embeddings_per_post_enabled

      # Now for posts
      table_name = DiscourseAi::Embeddings::Schema::POSTS_TABLE

      posts =
        Post
          .joins("LEFT JOIN #{table_name} ON #{table_name}.post_id = posts.id")
          .where(deleted_at: nil)
          .where(post_type: Post.types[:regular])

      # First, we'll try to backfill embeddings for posts that have none
      missing = posts.where("#{table_name}.post_id IS NULL").limit(limit - rebaked)
      rebaked += populate_post_embeddings(vector_rep, missing)

      return if rebaked >= limit

      # Then, we'll try to backfill embeddings for posts that have outdated
      # embeddings, be it model or strategy version
      outdated = posts.where(<<~SQL).limit(limit - rebaked)
        #{table_name}.model_version < #{vector_rep.version}
        OR
        #{table_name}.strategy_version < #{vector_rep.strategy_version}
      SQL
      rebaked += populate_post_embeddings(vector_rep, outdated)

      return if rebaked >= limit

      # Finally, we'll refresh a random sample of posts whose embeddings are
      # more than a week old. Here we only do 10% of the limit
      stale =
        posts
          .where("#{table_name}.updated_at < ?", 7.days.ago)
          .order("random()")
          .limit((limit - rebaked) / 10)
      rebaked += populate_post_embeddings(vector_rep, stale)

      rebaked
    end

    private

    # Generates embeddings for the topics in the given relation, 1000 at a time.
    #
    # @param vector_rep [Object] representation responding to
    #   `gen_bulk_reprensentations` (spelling as called in the original code)
    # @param topics [ActiveRecord::Relation] topics relation; callers in this
    #   file always pass one LEFT JOINed with the topic embeddings table
    # @param force [Boolean] when false (default) only topics without a joined
    #   embeddings row are processed; when true the relation is used as given
    # @return [Integer] number of topic ids handed to the representation
    def populate_topic_embeddings(vector_rep, topics, force: false)
      done = 0

      topics =
        topics.where("#{DiscourseAi::Embeddings::Schema::TOPICS_TABLE}.topic_id IS NULL") if !force

      # Load the ids up front so each batch is fetched by primary key.
      ids = topics.pluck("topics.id")
      batch_size = 1000

      ids.each_slice(batch_size) do |batch|
        vector_rep.gen_bulk_reprensentations(Topic.where(id: batch).order("topics.bumped_at DESC"))
        done += batch.length
      end

      done
    end

    # Generates embeddings for the posts in the given relation, in batches.
    #
    # @param vector_rep [Object] representation responding to
    #   `gen_bulk_reprensentations` (spelling as called in the original code)
    # @param posts [ActiveRecord::Relation] already filtered and limited posts
    # @param batch_size [Integer] number of posts sent per bulk call
    # @return [Integer] number of post ids handed to the representation
    def populate_post_embeddings(vector_rep, posts, batch_size: 1000)
      done = 0

      posts
        .pluck(:id)
        .each_slice(batch_size) do |batch|
          vector_rep.gen_bulk_reprensentations(Post.where(id: batch))
          done += batch.length
        end

      done
    end
  end
end
|