diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml
index c498e366..b83b86b1 100644
--- a/config/locales/server.en.yml
+++ b/config/locales/server.en.yml
@@ -59,6 +59,7 @@ en:
     ai_embeddings_generate_for_pms: "Generate embeddings for personal messages."
     ai_embeddings_semantic_related_topics_enabled: "Use Semantic Search for related topics."
     ai_embeddings_semantic_related_topics: "Maximum number of topics to show in related topic section."
+    ai_embeddings_backfill_batch_size: "Number of embeddings to backfill every 15 minutes."
     ai_embeddings_pg_connection_string: "PostgreSQL connection string for the embeddings module. Needs pgvector extension enabled and a series of tables created. See docs for more info."
     ai_embeddings_semantic_search_enabled: "Enable full-page semantic search."
     ai_embeddings_semantic_related_include_closed_topics: "Include closed topics in semantic search results"
diff --git a/config/settings.yml b/config/settings.yml
index 6b1514b1..a5a5fe59 100644
--- a/config/settings.yml
+++ b/config/settings.yml
@@ -199,6 +199,9 @@ discourse_ai:
     client: true
   ai_embeddings_semantic_related_topics: 5
   ai_embeddings_semantic_related_include_closed_topics: true
+  ai_embeddings_backfill_batch_size:
+    default: 250
+    hidden: true
   ai_embeddings_pg_connection_string:
     default: ""
     hidden: true
diff --git a/lib/modules/embeddings/jobs/scheduled/embeddings_backfill.rb b/lib/modules/embeddings/jobs/scheduled/embeddings_backfill.rb
new file mode 100644
index 00000000..bf8b6d75
--- /dev/null
+++ b/lib/modules/embeddings/jobs/scheduled/embeddings_backfill.rb
@@ -0,0 +1,79 @@
+# frozen_string_literal: true
+
+module Jobs
+  # Scheduled job that generates topic embeddings in bounded batches.
+  class EmbeddingsBackfill < ::Jobs::Base
+    every 15.minutes
+    sidekiq_options queue: "low"
+    cluster_concurrency 1
+
+    # Runs up to three passes that share one budget of
+    # SiteSetting.ai_embeddings_backfill_batch_size topics per run.
+    #
+    # Returns nil when embeddings are disabled or the budget runs out before
+    # the last pass; otherwise returns the number of topics processed.
+    def execute(args)
+      return unless SiteSetting.ai_embeddings_enabled
+
+      limit = SiteSetting.ai_embeddings_backfill_batch_size
+      rebaked = 0
+
+      strategy = DiscourseAi::Embeddings::Strategies::Truncation.new
+      vector_rep =
+        DiscourseAi::Embeddings::VectorRepresentations::Base.current_representation(strategy)
+      table_name = vector_rep.table_name
+
+      # No limit here: a relation's limit is fixed when it is built, so each
+      # pass applies its own limit from the budget still left at that point.
+      topics =
+        Topic
+          .joins("LEFT JOIN #{table_name} ON #{table_name}.topic_id = topics.id")
+          .where(deleted_at: nil)
+          .order("#{table_name}.updated_at ASC NULLS FIRST, topics.id DESC")
+
+      # Every pass iterates with each rather than find_each: batch iteration
+      # forces primary key order, which would discard both the ordering above
+      # and the random() ordering of the last pass. Each query is already
+      # bounded by its limit, so batching is not needed.
+
+      # First, we'll try to backfill embeddings for topics that have none
+      topics
+        .where("#{table_name}.topic_id IS NULL")
+        .limit(limit - rebaked)
+        .each do |t|
+          vector_rep.generate_topic_representation_from(t)
+          rebaked += 1
+        end
+
+      return if rebaked >= limit
+
+      # Then, we'll try to backfill embeddings for topics that have outdated
+      # embeddings, be it model or strategy version
+      topics
+        .where(<<~SQL)
+          #{table_name}.model_version < #{vector_rep.version}
+          OR
+          #{table_name}.strategy_version < #{strategy.version}
+        SQL
+        .limit(limit - rebaked)
+        .each do |t|
+          vector_rep.generate_topic_representation_from(t)
+          rebaked += 1
+        end
+
+      return if rebaked >= limit
+
+      # Finally, we'll refresh a random sample of topics to pick up edits and
+      # new replies. Here we only do 10% of the remaining budget
+      topics
+        .reorder("random()")
+        .limit((limit - rebaked) / 10)
+        .each do |t|
+          vector_rep.generate_topic_representation_from(t)
+          rebaked += 1
+        end
+
+      rebaked
+    end
+  end
+end