FIX: Proper flow when a topic doesn't have embeddings (#20)

This commit is contained in:
Rafael dos Santos Silva 2023-03-20 16:44:55 -03:00 committed by GitHub
parent fea9041ee1
commit 6bdbc0e32d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 19 additions and 10 deletions

View File

@ -23,17 +23,11 @@ module DiscourseAi
Discourse
.cache
.fetch("semantic-suggested-topic-#{topic.id}", expires_in: cache_for) do
suggested = search_suggestions(topic)
# Happens when the topic doesn't have any embeddings
if suggested.empty? || !suggested.include?(topic.id)
return { result: [], params: {} }
end
suggested
search_suggestions(topic)
end
rescue StandardError => e
Rails.logger.error("SemanticSuggested: #{e}")
return { result: [], params: {} }
end
# array_position forces the order of the topics to be preserved
@ -49,6 +43,7 @@ module DiscourseAi
function =
DiscourseAi::Embeddings::Models::SEARCH_FUNCTION_TO_PG_FUNCTION[model.functions.first]
candidate_ids =
DiscourseAi::Database::Connection.db.query(<<~SQL, topic_id: topic.id).map(&:topic_id)
SELECT
topic_id
@ -66,6 +61,14 @@ module DiscourseAi
)
LIMIT 11
SQL
# Happens when the topic doesn't have any embeddings
# I'd rather not use Exceptions to control the flow, so this should be refactored soon
if candidate_ids.empty? || !candidate_ids.include?(topic.id)
raise StandardError, "No embeddings found for topic #{topic.id}"
end
candidate_ids
end
end
end

View File

@ -2,6 +2,10 @@
desc "Creates tables to store embeddings"
task "ai:embeddings:create_table" => [:environment] do
DiscourseAi::Database::Connection.db.exec(<<~SQL)
CREATE EXTENSION IF NOT EXISTS pg_vector;
SQL
DiscourseAi::Embeddings::Models.enabled_models.each do |model|
DiscourseAi::Database::Connection.db.exec(<<~SQL)
CREATE TABLE IF NOT EXISTS topic_embeddings_#{model.name.underscore} (
@ -25,12 +29,13 @@ task "ai:embeddings:backfill" => [:environment] do
end
desc "Creates indexes for embeddings"
task "ai:embeddings:index" => [:environment] do
task "ai:embeddings:index", [:work_mem] => [:environment] do |_, args|
# Using 4 * sqrt(number of topics) as a rule of thumb for now
# Results are not as good as without indexes, but it's much faster
# Disk usage is ~1x the size of the table, so this doubles the total table size
lists = 4 * Math.sqrt(Topic.count).to_i
DiscourseAi::Database::Connection.db.exec("SET work_mem TO '#{args[:work_mem] || "1GB"}';")
DiscourseAi::Embeddings::Models.enabled_models.each do |model|
DiscourseAi::Database::Connection.db.exec(<<~SQL)
CREATE INDEX IF NOT EXISTS
@ -42,5 +47,6 @@ task "ai:embeddings:index" => [:environment] do
WITH
(lists = #{lists});
SQL
DiscourseAi::Database::Connection.db.exec("RESET work_mem;")
end
end