Updates to embedding rake tasks (#54)

- Create embeddings in topic ID order, so it's easier to stop a backfill and restart from where it stopped
- Update index parameters to follow current best practices
2025-09-08 20:50:38 +00:00 · 2023-05-09 13:45:16 -03:00 · 2023-05-09 13:45:16 -03:00 · f1133f66a6
commit f1133f66a6
parent e76fc77189
1 changed files with 9 additions and 4 deletions
--- a/lib/tasks/modules/embeddings/database.rake
+++ b/lib/tasks/modules/embeddings/database.rake
@ -17,12 +17,14 @@ task "ai:embeddings:create_table" => [:environment] do
 end
 desc "Backfill embeddings for all topics"
-task "ai:embeddings:backfill" => [:environment] do
+task "ai:embeddings:backfill", [:start_topic] => [:environment] do
  public_categories = Category.where(read_restricted: false).pluck(:id)
  topic_embeddings = DiscourseAi::Embeddings::Topic.new
  Topic
    .where("id >= ?", args[:start_topic] || 0)
    .where("category_id IN (?)", public_categories)
    .where(deleted_at: nil)
    .order(id: :asc)
    .find_each do |t|
      print "."
      topic_embeddings.generate_and_store_embeddings_for(t)
@ -31,10 +33,12 @@ end
 desc "Creates indexes for embeddings"
 task "ai:embeddings:index", [:work_mem] => [:environment] do |_, args|
-  # Using 4 * sqrt(number of topics) as a rule of thumb for now
+  # Using extension maintainer's recommendation for ivfflat indexes
  # Results are not as good as without indexes, but it's much faster
  # Disk usage is ~1x the size of the table, so this doubles the table's total size
-  lists = 4 * Math.sqrt(Topic.count).to_i
+  count = Topic.count
  # Integer division yields 0 on small sites (fewer than 1_000 topics for
  # lists, fewer than 10_000 for probes). pgvector requires both values to be
  # at least 1, so clamp them to keep index creation and the probes setting valid.
  lists = [count < 1_000_000 ? count / 1000 : Math.sqrt(count).to_i, 1].max
  probes = [count < 1_000_000 ? lists / 10 : Math.sqrt(lists).to_i, 1].max
  DiscourseAi::Database::Connection.db.exec("SET work_mem TO '#{args[:work_mem] || "1GB"}';")
  DiscourseAi::Embeddings::Model.enabled_models.each do |model|
@ -48,6 +52,7 @@ task "ai:embeddings:index", [:work_mem] => [:environment] do |_, args|
      WITH
        (lists = #{lists});
    SQL
  end
  # Restore work_mem only after every model's index has been built; resetting
  # inside the loop would leave all but the first model with the default value.
  DiscourseAi::Database::Connection.db.exec("RESET work_mem;")
  DiscourseAi::Database::Connection.db.exec("SET ivfflat.probes = #{probes};")
 end