From 6e6ced4554d12eb0455fd0569eae8216d56612ff Mon Sep 17 00:00:00 2001 From: Sam Saffron Date: Fri, 12 May 2023 15:28:12 +1000 Subject: [PATCH] Work in progress... post embeddings and version. Feel free to continue any work here; I will pick it up again next week. --- config/locales/server.en.yml | 1 + config/settings.yml | 4 ++- lib/modules/embeddings/topic.rb | 33 ++++++++++++++++++---- lib/tasks/modules/embeddings/database.rake | 19 +++++++++---- 4 files changed, 44 insertions(+), 13 deletions(-) diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml index 4bed6676..3e182e5b 100644 --- a/config/locales/server.en.yml +++ b/config/locales/server.en.yml @@ -47,6 +47,7 @@ en: ai_embeddings_generate_for_pms: "Generate embeddings for personal messages." ai_embeddings_semantic_related_topics_enabled: "Use Semantic Search for related topics." ai_embeddings_semantic_related_topics: "Maximum number of topics to show in related topic section." + ai_embeddings_semantic_related_include_closed_topics: "Include closed topics in related topics." ai_embeddings_pg_connection_string: "PostgreSQL connection string for the embeddings module. Needs pgvector extension enabled and a series of tables created. See docs for more info." ai_embeddings_semantic_search_model: "Model to use for semantic search." ai_embeddings_semantic_search_enabled: "Enable full-page semantic search." 
diff --git a/config/settings.yml b/config/settings.yml index 1c751a64..6ad61ac7 100644 --- a/config/settings.yml +++ b/config/settings.yml @@ -156,7 +156,9 @@ plugins: ai_embeddings_semantic_related_topics_enabled: false ai_embeddings_semantic_related_topics: 5 ai_embeddings_semantic_related_include_closed_topics: true - ai_embeddings_pg_connection_string: "" + ai_embeddings_pg_connection_string: + default: "" + secret: true ai_embeddings_semantic_search_enabled: default: false client: true diff --git a/lib/modules/embeddings/topic.rb b/lib/modules/embeddings/topic.rb index 8e009a34..94f61239 100644 --- a/lib/modules/embeddings/topic.rb +++ b/lib/modules/embeddings/topic.rb @@ -3,7 +3,9 @@ module DiscourseAi module Embeddings class Topic - def generate_and_store_embeddings_for(topic) + VERSION = 1 + + def generate_and_store_embeddings_for(topic, include_posts: true) return unless SiteSetting.ai_embeddings_enabled return if topic.blank? || topic.first_post.blank? @@ -13,6 +15,18 @@ module DiscourseAi enabled_models.each do |model| embedding = model.generate_embedding(topic.first_post.raw) persist_embedding(topic, model, embedding) if embedding + + if include_posts + persist_embedding(topic.first_post, model, embedding) if embedding + + topic + .posts + .where("post_number > 1 AND post_type = 1") + .each do |post| + embedding = model.generate_embedding(post.raw) + persist_embedding(post, model, embedding) if embedding + end + end end end @@ -70,13 +84,20 @@ module DiscourseAi private - def persist_embedding(topic, model, embedding) - DiscourseAi::Database::Connection.db.exec(<<~SQL, topic_id: topic.id, embedding: embedding) - INSERT INTO topic_embeddings_#{model.name.underscore} (topic_id, embedding) - VALUES (:topic_id, '[:embedding]') - ON CONFLICT (topic_id) + def persist_embedding(topic_or_post, model, embedding) + table = topic_or_post.is_a?(Topic) ? 
"topic" : "post" + + DiscourseAi::Database::Connection.db.exec( + <<~SQL, + INSERT INTO #{table}_embeddings_#{model.name.underscore} (#{table}_id, embedding, version) + VALUES (:id, '[:embedding]', :version) + ON CONFLICT (#{table}_id) DO UPDATE SET embedding = '[:embedding]' SQL + id: topic_or_post.id, + embedding: embedding, + version: VERSION, + ) end end end diff --git a/lib/tasks/modules/embeddings/database.rake b/lib/tasks/modules/embeddings/database.rake index a0ea1935..380d1f4d 100644 --- a/lib/tasks/modules/embeddings/database.rake +++ b/lib/tasks/modules/embeddings/database.rake @@ -7,17 +7,24 @@ task "ai:embeddings:create_table" => [:environment] do SQL DiscourseAi::Embeddings::Model.enabled_models.each do |model| - DiscourseAi::Database::Connection.db.exec(<<~SQL) - CREATE TABLE IF NOT EXISTS topic_embeddings_#{model.name.underscore} ( - topic_id bigint PRIMARY KEY, - embedding vector(#{model.dimensions}) - ); + %w[topic post].each do |table| + table_name = "#{table}_embeddings_#{model.name.underscore}" + DiscourseAi::Database::Connection.db.exec(<<~SQL) + CREATE TABLE IF NOT EXISTS #{table_name} ( + #{table}_id bigint PRIMARY KEY, + embedding vector(#{model.dimensions}), + version smallint + ) SQL + DiscourseAi::Database::Connection.db.exec(<<~SQL) + ALTER TABLE #{table_name} ADD COLUMN IF NOT EXISTS version smallint + SQL + end end end desc "Backfill embeddings for all topics" -task "ai:embeddings:backfill", [:start_topic] => [:environment] do +task "ai:embeddings:backfill", [:start_topic] => [:environment] do |_, args| public_categories = Category.where(read_restricted: false).pluck(:id) topic_embeddings = DiscourseAi::Embeddings::Topic.new Topic