From f1133f66a652975593299fe445f4418898cb7c7f Mon Sep 17 00:00:00 2001
From: Rafael dos Santos Silva
Date: Tue, 9 May 2023 13:45:16 -0300
Subject: [PATCH] Updates to embedding rake tasks (#54)

- Creates embeddings in topic ID order, so it's easier to stop and
  restart from where we stopped
- Adds an optional start_topic argument to ai:embeddings:backfill so a
  stopped run can be resumed from a given topic ID
- Update index parameters with current best practices, keeping both
  lists and probes at a minimum of 1 so small sites get valid values
---
 lib/tasks/modules/embeddings/database.rake | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/lib/tasks/modules/embeddings/database.rake b/lib/tasks/modules/embeddings/database.rake
index 57707d87..a0ea1935 100644
--- a/lib/tasks/modules/embeddings/database.rake
+++ b/lib/tasks/modules/embeddings/database.rake
@@ -17,12 +17,14 @@ task "ai:embeddings:create_table" => [:environment] do
 end
 
 desc "Backfill embeddings for all topics"
-task "ai:embeddings:backfill" => [:environment] do
+task "ai:embeddings:backfill", [:start_topic] => [:environment] do |_, args|
   public_categories = Category.where(read_restricted: false).pluck(:id)
   topic_embeddings = DiscourseAi::Embeddings::Topic.new
   Topic
+    .where("id >= ?", args[:start_topic] || 0)
     .where("category_id IN (?)", public_categories)
     .where(deleted_at: nil)
+    .order(id: :asc)
     .find_each do |t|
       print "."
       topic_embeddings.generate_and_store_embeddings_for(t)
@@ -31,10 +33,12 @@ end
 
 desc "Creates indexes for embeddings"
 task "ai:embeddings:index", [:work_mem] => [:environment] do |_, args|
-  # Using 4 * sqrt(number of topics) as a rule of thumb for now
+  # Using extension maintainer's recommendation for ivfflat indexes (never below 1 list/probe)
   # Results are not as good as without indexes, but it's much faster
   # Disk usage is ~1x the size of the table, so this double table total size
-  lists = 4 * Math.sqrt(Topic.count).to_i
+  count = Topic.count
+  lists = [(count < 1_000_000 ? count / 1000 : Math.sqrt(count).to_i), 1].max
+  probes = [(count < 1_000_000 ? lists / 10 : Math.sqrt(lists).to_i), 1].max
 
   DiscourseAi::Database::Connection.db.exec("SET work_mem TO '#{args[:work_mem] || "1GB"}';")
   DiscourseAi::Embeddings::Model.enabled_models.each do |model|
@@ -48,6 +52,7 @@ task "ai:embeddings:index", [:work_mem] => [:environment] do |_, args|
       WITH
         (lists = #{lists});
     SQL
-    DiscourseAi::Database::Connection.db.exec("RESET work_mem;")
   end
+  DiscourseAi::Database::Connection.db.exec("RESET work_mem;")
+  DiscourseAi::Database::Connection.db.exec("SET ivfflat.probes = #{probes};")
 end