FIX: Proper flow when a topic doesn't have embeddings (#20)

Rafael dos Santos Silva 2023-03-20 16:44:55 -03:00 committed by GitHub
parent fea9041ee1
commit 6bdbc0e32d
2 changed files with 19 additions and 10 deletions

@@ -23,17 +23,11 @@ module DiscourseAi
          Discourse
            .cache
            .fetch("semantic-suggested-topic-#{topic.id}", expires_in: cache_for) do
-             suggested = search_suggestions(topic)
-
-             # Happens when the topic doesn't have any embeddings
-             if suggested.empty? || !suggested.include?(topic.id)
-               return { result: [], params: {} }
-             end
-
-             suggested
+             search_suggestions(topic)
            end
      rescue StandardError => e
        Rails.logger.error("SemanticSuggested: #{e}")
        return { result: [], params: {} }
      end
      # array_position forces the order of the topics to be preserved
@@ -49,7 +43,8 @@ module DiscourseAi
       function =
         DiscourseAi::Embeddings::Models::SEARCH_FUNCTION_TO_PG_FUNCTION[model.functions.first]
 
-      DiscourseAi::Database::Connection.db.query(<<~SQL, topic_id: topic.id).map(&:topic_id)
+      candidate_ids =
+        DiscourseAi::Database::Connection.db.query(<<~SQL, topic_id: topic.id).map(&:topic_id)
         SELECT
           topic_id
         FROM
@@ -66,6 +61,14 @@ module DiscourseAi
           )
           LIMIT 11
       SQL
+
+      # Happens when the topic doesn't have any embeddings
+      # I'd rather not use Exceptions to control the flow, so this should be refactored soon
+      if candidate_ids.empty? || !candidate_ids.include?(topic.id)
+        raise StandardError, "No embeddings found for topic #{topic.id}"
+      end
+
+      candidate_ids
     end
   end
 end
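A note on the resulting flow: with this change, the no-embeddings case is detected inside search_suggestions, which raises, and the rescue wrapped around the cached lookup logs the error and returns the empty { result: [], params: {} } payload instead of caching a bad entry. The sketch below is a simplified, hypothetical stand-in for that flow in plain Ruby: TinyCache, SemanticSuggestedSketch, and the stubbed query are illustrative names, not the plugin's actual classes or API.

# Minimal sketch of the flow after this commit; the cache and the query are toy stand-ins.
class TinyCache
  def initialize
    @store = {}
  end

  # Stores the block's value only when the block returns normally;
  # if the block raises, nothing is cached (mirroring a fetch-style cache).
  def fetch(key)
    @store.fetch(key) { @store[key] = yield }
  end
end

module SemanticSuggestedSketch
  def self.candidates_for(topic_id, cache)
    candidate_ids =
      cache.fetch("semantic-suggested-topic-#{topic_id}") { search_suggestions(topic_id) }
    { result: candidate_ids, params: {} }
  rescue StandardError => e
    warn "SemanticSuggested: #{e}" # the plugin logs via Rails.logger.error
    { result: [], params: {} }
  end

  def self.search_suggestions(topic_id)
    candidate_ids = [] # stand-in for the pgvector similarity query; pretend no embeddings exist yet
    if candidate_ids.empty? || !candidate_ids.include?(topic_id)
      raise StandardError, "No embeddings found for topic #{topic_id}"
    end
    candidate_ids
  end
end

p SemanticSuggestedSketch.candidates_for(42, TinyCache.new)
# prints the empty fallback payload (result [], params {}) after the warning is written to stderr

In the sketch, nothing is stored under the cache key when the block raises, so the lookup is simply retried on a later call once embeddings exist.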


@@ -2,6 +2,10 @@
 
 desc "Creates tables to store embeddings"
 task "ai:embeddings:create_table" => [:environment] do
+  DiscourseAi::Database::Connection.db.exec(<<~SQL)
+    CREATE EXTENSION IF NOT EXISTS pg_vector;
+  SQL
+
   DiscourseAi::Embeddings::Models.enabled_models.each do |model|
     DiscourseAi::Database::Connection.db.exec(<<~SQL)
       CREATE TABLE IF NOT EXISTS topic_embeddings_#{model.name.underscore} (
@@ -25,12 +29,13 @@ task "ai:embeddings:backfill" => [:environment] do
 end
 
 desc "Creates indexes for embeddings"
-task "ai:embeddings:index" => [:environment] do
+task "ai:embeddings:index", [:work_mem] => [:environment] do |_, args|
   # Using 4 * sqrt(number of topics) as a rule of thumb for now
   # Results are not as good as without indexes, but it's much faster
   # Disk usage is ~1x the size of the table, so this double table total size
   lists = 4 * Math.sqrt(Topic.count).to_i
 
+  DiscourseAi::Database::Connection.db.exec("SET work_mem TO '#{args[:work_mem] || "1GB"}';")
   DiscourseAi::Embeddings::Models.enabled_models.each do |model|
     DiscourseAi::Database::Connection.db.exec(<<~SQL)
       CREATE INDEX IF NOT EXISTS
@@ -42,5 +47,6 @@ task "ai:embeddings:index" => [:environment] do
         WITH
           (lists = #{lists});
     SQL
+    DiscourseAi::Database::Connection.db.exec("RESET work_mem;")
   end
 end
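Usage note (not part of the commit): the new task signature takes work_mem as an optional Rake argument, falling back to the '1GB' default seen above, while lists follows the 4 * sqrt(topic count) rule of thumb from the comments. A small illustrative sketch, with made-up numbers:

# Hypothetical invocations -- the '4GB' value is an example, not a recommendation:
#   bin/rake "ai:embeddings:index"        # falls back to the 1GB work_mem default
#   bin/rake "ai:embeddings:index[4GB]"   # passes work_mem = '4GB' to the task

# The rule-of-thumb lists value from the task, assuming (say) 250_000 topics:
topic_count = 250_000                     # illustrative figure, not from the commit
lists = 4 * Math.sqrt(topic_count).to_i   # sqrt(250_000) = 500, so lists = 2000
puts lists

Since the task issues SET work_mem before the index builds and RESET work_mem after each one, the larger memory setting is scoped to that database session rather than changing the server default.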