FEATURE: Automatic embeddings backfill

This commit is contained in:
Rafael dos Santos Silva 2023-10-23 16:54:24 -03:00
parent 0a76bea076
commit 7174fff7f0
No known key found for this signature in database
3 changed files with 69 additions and 0 deletions

View File

@ -59,6 +59,7 @@ en:
ai_embeddings_generate_for_pms: "Generate embeddings for personal messages."
ai_embeddings_semantic_related_topics_enabled: "Use Semantic Search for related topics."
ai_embeddings_semantic_related_topics: "Maximum number of topics to show in related topic section."
ai_embeddings_backfill_batch_size: "Number of embeddings to backfill every 15 minutes."
ai_embeddings_pg_connection_string: "PostgreSQL connection string for the embeddings module. Needs pgvector extension enabled and a series of tables created. See docs for more info."
ai_embeddings_semantic_search_enabled: "Enable full-page semantic search."
ai_embeddings_semantic_related_include_closed_topics: "Include closed topics in semantic search results"

View File

@ -199,6 +199,9 @@ discourse_ai:
client: true
ai_embeddings_semantic_related_topics: 5
ai_embeddings_semantic_related_include_closed_topics: true
ai_embeddings_backfill_batch_size:
default: 250
hidden: true
ai_embeddings_pg_connection_string:
default: ""
hidden: true

View File

@ -0,0 +1,65 @@
# frozen_string_literal: true
module Jobs
  # Scheduled job that incrementally generates and refreshes topic embeddings.
  #
  # Every run works through up to three passes that share one budget of
  # `SiteSetting.ai_embeddings_backfill_batch_size` topics:
  #
  #   1. topics that have no row in the embeddings table;
  #   2. topics whose stored model or strategy version is older than the
  #      current one;
  #   3. a random sample of topics, sized at 10% of whatever budget remains.
  #
  # Records are iterated with a plain bounded load rather than `find_each`,
  # because `find_each` forces primary-key batch order and would discard both
  # the "oldest embedding first" ordering and the `random()` sampling.
  class EmbeddingsBackfill < ::Jobs::Base
    every 15.minutes
    sidekiq_options queue: "low"
    # NOTE(review): presumably limits the job to a single concurrent run
    # across the cluster — confirm against Jobs::Base.
    cluster_concurrency 1

    # Runs one backfill batch.
    #
    # @param args [Hash] job arguments (unused)
    # @return [Integer, nil] number of topics processed when all passes ran;
    #   nil when embeddings are disabled or the budget was exhausted early
    def execute(args)
      return unless SiteSetting.ai_embeddings_enabled

      limit = SiteSetting.ai_embeddings_backfill_batch_size
      # A non-positive budget means there is nothing to do; it would also
      # produce an invalid negative LIMIT clause further down.
      return if limit <= 0

      rebaked = 0

      strategy = DiscourseAi::Embeddings::Strategies::Truncation.new
      vector_rep =
        DiscourseAi::Embeddings::VectorRepresentations::Base.current_representation(strategy)
      table_name = vector_rep.table_name

      # The LIMIT is deliberately NOT applied here: it has to be recomputed
      # for every pass from the budget that is still available.
      topics =
        Topic
          .joins("LEFT JOIN #{table_name} ON #{table_name}.topic_id = topics.id")
          .where(deleted_at: nil)
          .order("#{table_name}.updated_at ASC NULLS FIRST, topics.id DESC")

      # First, we'll try to backfill embeddings for topics that have none
      rebaked +=
        rebake(topics.where("#{table_name}.topic_id IS NULL").limit(limit - rebaked), vector_rep)

      return if rebaked >= limit

      # Then, we'll try to backfill embeddings for topics that have outdated
      # embeddings, be it model or strategy version
      outdated =
        topics.where(
          "#{table_name}.model_version < ? OR #{table_name}.strategy_version < ?",
          vector_rep.version,
          strategy.version,
        )
      rebaked += rebake(outdated.limit(limit - rebaked), vector_rep)

      return if rebaked >= limit

      # Finally, refresh a random sample of topics so embeddings that went
      # stale because of edits or new replies eventually get regenerated.
      # Only 10% of the remaining budget is spent here.
      # NOTE(review): this pass does not filter on staleness; any non-deleted
      # topic may be picked.
      rebaked += rebake(topics.reorder("random()").limit((limit - rebaked) / 10), vector_rep)

      rebaked
    end

    private

    # Generates the embedding for every topic in +scope+.
    #
    # The scope is loaded in one query so its ORDER BY and LIMIT are honored;
    # callers always bound it by the batch size.
    #
    # @param scope [ActiveRecord::Relation] limited relation of topics
    # @param vector_rep [Object] vector representation used to generate embeddings
    # @return [Integer] number of topics processed
    def rebake(scope, vector_rep)
      batch = scope.to_a
      batch.each { |topic| vector_rep.generate_topic_representation_from(topic) }
      batch.size
    end
  end
end