From 703762a7a9705d9646f50e77bdd517ed46cf7d05 Mon Sep 17 00:00:00 2001 From: Rafael dos Santos Silva Date: Thu, 13 Jul 2023 18:59:25 -0300 Subject: [PATCH] PERF: .find_each instead of .each to save us from memory allocation peaks also Fix embeddings rake task for new db structure --- .../embeddings/strategies/truncation.rb | 2 +- lib/tasks/modules/embeddings/database.rake | 66 +++++++++---------- 2 files changed, 31 insertions(+), 37 deletions(-) diff --git a/lib/modules/embeddings/strategies/truncation.rb b/lib/modules/embeddings/strategies/truncation.rb index 8f5ef2e6..8a9b9fa4 100644 --- a/lib/modules/embeddings/strategies/truncation.rb +++ b/lib/modules/embeddings/strategies/truncation.rb @@ -52,7 +52,7 @@ module DiscourseAi end t << "\n\n" - topic.posts.each do |post| + topic.posts.find_each do |post| t << post.raw break if @tokenizer.size(t) >= @max_length t << "\n\n" diff --git a/lib/tasks/modules/embeddings/database.rake b/lib/tasks/modules/embeddings/database.rake index fccb1592..96b9f15d 100644 --- a/lib/tasks/modules/embeddings/database.rake +++ b/lib/tasks/modules/embeddings/database.rake @@ -1,33 +1,21 @@ # frozen_string_literal: true -desc "Creates tables to store embeddings" -task "ai:embeddings:create_table" => [:environment] do - DiscourseAi::Database::Connection.db.exec(<<~SQL) - CREATE EXTENSION IF NOT EXISTS vector; - SQL - - DiscourseAi::Embeddings::Model.enabled_models.each do |model| - DiscourseAi::Database::Connection.db.exec(<<~SQL) - CREATE TABLE IF NOT EXISTS topic_embeddings_#{model.name.underscore} ( - topic_id bigint PRIMARY KEY, - embedding vector(#{model.dimensions}) - ); - SQL - end -end - desc "Backfill embeddings for all topics" task "ai:embeddings:backfill", [:start_topic] => [:environment] do |_, args| public_categories = Category.where(read_restricted: false).pluck(:id) - topic_embeddings = DiscourseAi::Embeddings::Topic.new + manager = DiscourseAi::Embeddings::Manager.new(Topic.first) Topic - .where("id >= ?", 
args[:start_topic] || 0) + .joins( + "LEFT JOIN #{manager.topic_embeddings_table} ON #{manager.topic_embeddings_table}.topic_id = topics.id", + ) + .where("#{manager.topic_embeddings_table}.topic_id IS NULL") + .where("topics.id >= ?", args[:start_topic].to_i || 0) .where("category_id IN (?)", public_categories) .where(deleted_at: nil) - .order(id: :asc) + .order("topics.id ASC") .find_each do |t| print "." - topic_embeddings.generate_and_store_embeddings_for(t) + DiscourseAi::Embeddings::Manager.new(t).generate! end end @@ -35,24 +23,30 @@ desc "Creates indexes for embeddings" task "ai:embeddings:index", [:work_mem] => [:environment] do |_, args| # Using extension maintainer's recommendation for ivfflat indexes # Results are not as good as without indexes, but it's much faster - # Disk usage is ~1x the size of the table, so this double table total size + # Disk usage is ~1x the size of the table, so this doubles table total size count = Topic.count lists = count < 1_000_000 ? count / 1000 : Math.sqrt(count).to_i probes = count < 1_000_000 ? 
lists / 10 : Math.sqrt(lists).to_i - DiscourseAi::Database::Connection.db.exec("SET work_mem TO '#{args[:work_mem] || "1GB"}';") - DiscourseAi::Embeddings::Model.enabled_models.each do |model| - DiscourseAi::Database::Connection.db.exec(<<~SQL) - CREATE INDEX IF NOT EXISTS - topic_embeddings_#{model.name.underscore}_search - ON - topic_embeddings_#{model.name.underscore} - USING - ivfflat (embedding #{model.pg_index}) - WITH - (lists = #{lists}); - SQL - end - DiscourseAi::Database::Connection.db.exec("RESET work_mem;") - DiscourseAi::Database::Connection.db.exec("SET ivfflat.probes = #{probes};") + manager = DiscourseAi::Embeddings::Manager.new(Topic.first) + table = manager.topic_embeddings_table + index = "#{table}_search" + + DB.exec("SET work_mem TO '#{args[:work_mem] || "1GB"}';") + DB.exec(<<~SQL) + DROP INDEX IF EXISTS #{index}; + CREATE INDEX IF NOT EXISTS + #{index} + ON + #{table} + USING + ivfflat (embeddings #{manager.model.pg_index_type}) + WITH + (lists = #{lists}) + WHERE + model_version = #{manager.model.version} AND + strategy_version = #{manager.strategy.version}; + SQL + DB.exec("RESET work_mem;") + DB.exec("SET ivfflat.probes = #{probes};") end