Work in progress: post embeddings and embedding version tracking

Feel free to continue any work here; I will pick it up again next week.
This commit is contained in:
Sam Saffron 2023-05-12 15:28:12 +10:00
parent 93d9d9ea91
commit 6e6ced4554
No known key found for this signature in database
GPG Key ID: B9606168D2FFD9F5
4 changed files with 44 additions and 13 deletions

View File

@ -47,6 +47,7 @@ en:
ai_embeddings_generate_for_pms: "Generate embeddings for personal messages."
ai_embeddings_semantic_related_topics_enabled: "Use Semantic Search for related topics."
ai_embeddings_semantic_related_topics: "Maximum number of topics to show in related topic section."
ai_embeddings_semantic_related_include_closed_topics: "Include closed topics in related topics."
ai_embeddings_pg_connection_string: "PostgreSQL connection string for the embeddings module. Needs pgvector extension enabled and a series of tables created. See docs for more info."
ai_embeddings_semantic_search_model: "Model to use for semantic search."
ai_embeddings_semantic_search_enabled: "Enable full-page semantic search."

View File

@ -156,7 +156,9 @@ plugins:
ai_embeddings_semantic_related_topics_enabled: false
ai_embeddings_semantic_related_topics: 5
ai_embeddings_semantic_related_include_closed_topics: true
ai_embeddings_pg_connection_string: ""
ai_embeddings_pg_connection_string:
default: ""
secret: true
ai_embeddings_semantic_search_enabled:
default: false
client: true

View File

@ -3,7 +3,9 @@
module DiscourseAi
module Embeddings
class Topic
def generate_and_store_embeddings_for(topic)
VERSION = 1
def generate_and_store_embeddings_for(topic, include_posts: true)
return unless SiteSetting.ai_embeddings_enabled
return if topic.blank? || topic.first_post.blank?
@ -13,6 +15,18 @@ module DiscourseAi
enabled_models.each do |model|
embedding = model.generate_embedding(topic.first_post.raw)
persist_embedding(topic, model, embedding) if embedding
if include_posts
persist_embedding(topic.first_post, model, embedding) if embedding
topic
.posts
.where("post_number > 1 AND post_type = 1")
.each do |post|
embedding = model.generate_embedding(post.raw)
persist_embedding(post, model, embedding) if embedding
end
end
end
end
@ -70,13 +84,20 @@ module DiscourseAi
private
def persist_embedding(topic, model, embedding)
DiscourseAi::Database::Connection.db.exec(<<~SQL, topic_id: topic.id, embedding: embedding)
INSERT INTO topic_embeddings_#{model.name.underscore} (topic_id, embedding)
VALUES (:topic_id, '[:embedding]')
ON CONFLICT (topic_id)
# Upserts the embedding for a topic or a post into the per-model table
# (`topic_embeddings_<model>` or `post_embeddings_<model>`), stamping the row
# with the current embeddings VERSION.
#
# @param topic_or_post [::Topic, Object] record whose `id` keys the row; any
#   record that is not a ::Topic is stored in the post table
# @param model [#name] embedding model; `model.name.underscore` is the table suffix
# @param embedding [Object] embedding value bound into the `[:embedding]` vector literal
def persist_embedding(topic_or_post, model, embedding)
  # Use the fully qualified ::Topic: when this method is nested inside a class
  # that is itself named Topic, a bare `Topic` resolves to that class instead
  # of the model, and every record would be routed to the post table.
  table = topic_or_post.is_a?(::Topic) ? "topic" : "post"
  DiscourseAi::Database::Connection.db.exec(
    <<~SQL,
      INSERT INTO #{table}_embeddings_#{model.name.underscore} (#{table}_id, embedding, version)
      VALUES (:id, '[:embedding]', :version)
      ON CONFLICT (#{table}_id)
      DO UPDATE SET embedding = '[:embedding]', version = :version
    SQL
    id: topic_or_post.id,
    embedding: embedding,
    # Also written on conflict so a regenerated embedding never keeps a stale version.
    version: VERSION,
  )
end
end
end

View File

@ -7,17 +7,24 @@ task "ai:embeddings:create_table" => [:environment] do
SQL
DiscourseAi::Embeddings::Model.enabled_models.each do |model|
DiscourseAi::Database::Connection.db.exec(<<~SQL)
CREATE TABLE IF NOT EXISTS topic_embeddings_#{model.name.underscore} (
topic_id bigint PRIMARY KEY,
embedding vector(#{model.dimensions})
);
# Ensure both the topic and the post embeddings table exist for this model.
%w[topic post].each do |table|
# Table name pattern: <topic|post>_embeddings_<underscored model name>.
table_name = "#{table}_embeddings_#{model.name.underscore}"
# New tables are created with the version column already present.
DiscourseAi::Database::Connection.db.exec(<<~SQL)
CREATE TABLE IF NOT EXISTS #{table_name} (
#{table}_id bigint PRIMARY KEY,
embedding vector(#{model.dimensions}),
version smallint
)
SQL
# CREATE TABLE IF NOT EXISTS leaves an already-existing table untouched, so
# the version column is added separately for tables that were created
# without it. The IF NOT EXISTS guard makes re-running the task safe.
DiscourseAi::Database::Connection.db.exec(<<~SQL)
ALTER TABLE #{table_name} ADD COLUMN IF NOT EXISTS version smallint
SQL
end
end
end
desc "Backfill embeddings for all topics"
task "ai:embeddings:backfill", [:start_topic] => [:environment] do
task "ai:embeddings:backfill", [:start_topic] => [:environment] do |_, args|
public_categories = Category.where(read_restricted: false).pluck(:id)
topic_embeddings = DiscourseAi::Embeddings::Topic.new
Topic