mirror of
https://github.com/discourse/discourse-ai.git
synced 2025-03-09 11:48:47 +00:00
Updates to embedding rake tasks (#54)
- Create embeddings in topic ID order, so it is easier to stop a backfill and resume it from the last processed topic
- Update index parameters to follow current best practices
This commit is contained in:
parent
e76fc77189
commit
f1133f66a6
@ -17,12 +17,14 @@ task "ai:embeddings:create_table" => [:environment] do
|
|||||||
end
|
end
|
||||||
|
|
||||||
desc "Backfill embeddings for all topics"
|
desc "Backfill embeddings for all topics"
|
||||||
task "ai:embeddings:backfill" => [:environment] do
|
task "ai:embeddings:backfill", [:start_topic] => [:environment] do
|
||||||
public_categories = Category.where(read_restricted: false).pluck(:id)
|
public_categories = Category.where(read_restricted: false).pluck(:id)
|
||||||
topic_embeddings = DiscourseAi::Embeddings::Topic.new
|
topic_embeddings = DiscourseAi::Embeddings::Topic.new
|
||||||
Topic
|
Topic
|
||||||
|
.where("id >= ?", args[:start_topic] || 0)
|
||||||
.where("category_id IN (?)", public_categories)
|
.where("category_id IN (?)", public_categories)
|
||||||
.where(deleted_at: nil)
|
.where(deleted_at: nil)
|
||||||
|
.order(id: :asc)
|
||||||
.find_each do |t|
|
.find_each do |t|
|
||||||
print "."
|
print "."
|
||||||
topic_embeddings.generate_and_store_embeddings_for(t)
|
topic_embeddings.generate_and_store_embeddings_for(t)
|
||||||
@ -31,10 +33,12 @@ end
|
|||||||
|
|
||||||
desc "Creates indexes for embeddings"
|
desc "Creates indexes for embeddings"
|
||||||
task "ai:embeddings:index", [:work_mem] => [:environment] do |_, args|
|
task "ai:embeddings:index", [:work_mem] => [:environment] do |_, args|
|
||||||
# Using 4 * sqrt(number of topics) as a rule of thumb for now
|
# Using extension maintainer's recommendation for ivfflat indexes
|
||||||
# Results are not as good as without indexes, but it's much faster
|
# Results are not as good as without indexes, but it's much faster
|
||||||
  # Disk usage is ~1x the size of the table, so this doubles the total table size
|
  # Disk usage is ~1x the size of the table, so this doubles the total table size
|
||||||
lists = 4 * Math.sqrt(Topic.count).to_i
|
count = Topic.count
|
||||||
|
lists = count < 1_000_000 ? count / 1000 : Math.sqrt(count).to_i
|
||||||
|
probes = count < 1_000_000 ? lists / 10 : Math.sqrt(lists).to_i
|
||||||
|
|
||||||
DiscourseAi::Database::Connection.db.exec("SET work_mem TO '#{args[:work_mem] || "1GB"}';")
|
DiscourseAi::Database::Connection.db.exec("SET work_mem TO '#{args[:work_mem] || "1GB"}';")
|
||||||
DiscourseAi::Embeddings::Model.enabled_models.each do |model|
|
DiscourseAi::Embeddings::Model.enabled_models.each do |model|
|
||||||
@ -48,6 +52,7 @@ task "ai:embeddings:index", [:work_mem] => [:environment] do |_, args|
|
|||||||
WITH
|
WITH
|
||||||
(lists = #{lists});
|
(lists = #{lists});
|
||||||
SQL
|
SQL
|
||||||
DiscourseAi::Database::Connection.db.exec("RESET work_mem;")
|
|
||||||
end
|
end
|
||||||
|
DiscourseAi::Database::Connection.db.exec("RESET work_mem;")
|
||||||
|
DiscourseAi::Database::Connection.db.exec("SET ivfflat.probes = #{probes};")
|
||||||
end
|
end
|
||||||
|
Loading…
x
Reference in New Issue
Block a user