FEATURE: Automatic embeddings backfill
This commit is contained in:
parent
0a76bea076
commit
7174fff7f0
|
@ -59,6 +59,7 @@ en:
|
|||
ai_embeddings_generate_for_pms: "Generate embeddings for personal messages."
|
||||
ai_embeddings_semantic_related_topics_enabled: "Use Semantic Search for related topics."
|
||||
ai_embeddings_semantic_related_topics: "Maximum number of topics to show in related topic section."
|
||||
ai_embeddings_backfill_batch_size: "Number of embeddings to backfill every 15 minutes."
|
||||
ai_embeddings_pg_connection_string: "PostgreSQL connection string for the embeddings module. Needs pgvector extension enabled and a series of tables created. See docs for more info."
|
||||
ai_embeddings_semantic_search_enabled: "Enable full-page semantic search."
|
||||
ai_embeddings_semantic_related_include_closed_topics: "Include closed topics in semantic search results"
|
||||
|
|
|
@ -199,6 +199,9 @@ discourse_ai:
|
|||
client: true
|
||||
ai_embeddings_semantic_related_topics: 5
|
||||
ai_embeddings_semantic_related_include_closed_topics: true
|
||||
ai_embeddings_backfill_batch_size:
|
||||
default: 250
|
||||
hidden: true
|
||||
ai_embeddings_pg_connection_string:
|
||||
default: ""
|
||||
hidden: true
|
||||
|
|
|
@ -0,0 +1,65 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
module Jobs
  # Periodic job that generates or refreshes topic embeddings in bounded batches.
  #
  # Each run processes at most `SiteSetting.ai_embeddings_backfill_batch_size`
  # topics, spending that budget in three phases:
  #   1. topics that have no embedding row at all;
  #   2. topics whose stored model or strategy version is older than the current one;
  #   3. a random sample (10% of whatever budget is left) of topics, to pick up
  #      edits and new replies.
  #
  # NOTE(review): `every` is usually provided by the scheduled-job base class in
  # Discourse; confirm that ::Jobs::Base exposes it here.
  class EmbeddingsBackfill < ::Jobs::Base
    every 15.minutes
    sidekiq_options queue: "low"
    cluster_concurrency 1

    # Runs one backfill pass.
    #
    # @param args [Hash] job arguments (unused)
    # @return [Integer, nil] number of topics processed when all three phases
    #   ran; nil when embeddings are disabled or the budget ran out early
    def execute(args)
      return unless SiteSetting.ai_embeddings_enabled

      limit = SiteSetting.ai_embeddings_backfill_batch_size
      rebaked = 0

      strategy = DiscourseAi::Embeddings::Strategies::Truncation.new
      vector_rep =
        DiscourseAi::Embeddings::VectorRepresentations::Base.current_representation(strategy)
      table_name = vector_rep.table_name

      # No LIMIT on the shared scope: the remaining budget changes between
      # phases, so it is applied per phase inside #backfill.
      topics =
        Topic
          .joins("LEFT JOIN #{table_name} ON #{table_name}.topic_id = topics.id")
          .where(deleted_at: nil)
          .order("#{table_name}.updated_at ASC NULLS FIRST, topics.id DESC")

      # First, we'll try to backfill embeddings for topics that have none
      rebaked +=
        backfill(topics.where("#{table_name}.topic_id IS NULL"), limit - rebaked, vector_rep)

      return if rebaked >= limit

      # Then, we'll try to backfill embeddings for topics that have outdated
      # embeddings, be it model or strategy version
      outdated = topics.where(<<~SQL)
        #{table_name}.model_version < #{vector_rep.version}
        OR
        #{table_name}.strategy_version < #{strategy.version}
      SQL
      rebaked += backfill(outdated, limit - rebaked, vector_rep)

      return if rebaked >= limit

      # Finally, we'll try to backfill embeddings for topics that have outdated
      # embeddings due to edits or new replies. Here we only do 10% of the limit
      rebaked += backfill(topics.reorder("random()"), (limit - rebaked) / 10, vector_rep)

      rebaked
    end

    private

    # Generates a representation for at most `budget` topics taken from `scope`.
    #
    # Iterates with `each` rather than `find_each`: batched iteration orders by
    # primary key and discards the scope's own ordering, which would defeat both
    # the oldest-first ordering and the random sampling. The result set is
    # bounded by `budget`, so loading it in one query is acceptable.
    #
    # @param scope [ActiveRecord::Relation] topics eligible for this phase
    # @param budget [Integer] maximum number of topics to process
    # @param vector_rep [Object] representation used to generate the embeddings
    # @return [Integer] number of topics processed
    def backfill(scope, budget, vector_rep)
      return 0 unless budget.positive?

      processed = 0
      scope
        .limit(budget)
        .each do |topic|
          vector_rep.generate_topic_representation_from(topic)
          processed += 1
        end
      processed
    end
  end
end
|
Loading…
Reference in New Issue