Roman Rizzi 4e05763a99
FEATURE: Semantic assymetric full-page search (#34)
Depends on discourse/discourse#20915

Hooks to the full-page-search component using an experimental API and performs an assymetric similarity search using our embeddings database.
2023-03-31 15:29:56 -03:00

54 lines
1.8 KiB
Ruby

# frozen_string_literal: true
desc "Creates tables to store embeddings"
task "ai:embeddings:create_table" => [:environment] do
DiscourseAi::Database::Connection.db.exec(<<~SQL)
CREATE EXTENSION IF NOT EXISTS vector;
SQL
DiscourseAi::Embeddings::Model.enabled_models.each do |model|
DiscourseAi::Database::Connection.db.exec(<<~SQL)
CREATE TABLE IF NOT EXISTS topic_embeddings_#{model.name.underscore} (
topic_id bigint PRIMARY KEY,
embedding vector(#{model.dimensions})
);
SQL
end
end
desc "Backfill embeddings for all topics"
task "ai:embeddings:backfill" => [:environment] do
public_categories = Category.where(read_restricted: false).pluck(:id)
topic_embeddings = DiscourseAi::Embeddings::Topic.new
Topic
.where("category_id IN (?)", public_categories)
.where(deleted_at: nil)
.find_each do |t|
print "."
topic_embeddings.generate_and_store_embeddings_for(t)
end
end
desc "Creates indexes for embeddings"
task "ai:embeddings:index", [:work_mem] => [:environment] do |_, args|
# Using 4 * sqrt(number of topics) as a rule of thumb for now
# Results are not as good as without indexes, but it's much faster
# Disk usage is ~1x the size of the table, so this double table total size
lists = 4 * Math.sqrt(Topic.count).to_i
DiscourseAi::Database::Connection.db.exec("SET work_mem TO '#{args[:work_mem] || "1GB"}';")
DiscourseAi::Embeddings::Model.enabled_models.each do |model|
DiscourseAi::Database::Connection.db.exec(<<~SQL)
CREATE INDEX IF NOT EXISTS
topic_embeddings_#{model.name.underscore}_search
ON
topic_embeddings_#{model.name.underscore}
USING
ivfflat (embedding #{model.pg_index})
WITH
(lists = #{lists});
SQL
DiscourseAi::Database::Connection.db.exec("RESET work_mem;")
end
end