mirror of
https://github.com/discourse/discourse-ai.git
synced 2025-02-17 17:04:48 +00:00
* FEATURE: Embeddings to main db This commit moves our embeddings store from an external configurable PostgreSQL instance back into the main database. This is done to simplify the setup. There is a migration that will try to import the external embeddings into the main DB if it is configured and there are rows. It removes support for embedding models that aren't all_mpnet_base_v2 or OpenAI text_embedding_ada_002. However, it will now be easier to add new models. It also now takes into account: - topic title - topic category - topic tags - replies (as much as the model allows) We introduce an interface so we can eventually support multiple strategies for handling long topics. This PR severely damages semantic search performance, but this is temporary until we can adapt HyDE to make semantic search use the same embeddings we have for semantic related with good performance. Here we also have some groundwork to add post-level embeddings, but this will be added in a future PR. Please note that this PR will also block Discourse from booting / updating if this plugin is installed and the pgvector extension isn't available on the PostgreSQL instance Discourse uses.
64 lines
2.0 KiB
Ruby
64 lines
2.0 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
# Copies topic embeddings out of the legacy dedicated embeddings database
# (reached through DiscourseAi::Database::Connection.db) into the
# per-model / per-strategy `ai_topic_embeddings_*` tables written through `DB`.
class MigrateEmbeddingsFromDedicatedDatabase < ActiveRecord::Migration[7.0]
  # Runs the copy for every supported model/strategy pair.
  #
  # Does nothing when embeddings are disabled or when no connection string for
  # the dedicated database is configured.
  def up
    if !SiteSetting.ai_embeddings_enabled ||
         SiteSetting.ai_embeddings_pg_connection_string.blank?
      return
    end

    embedding_models = [
      DiscourseAi::Embeddings::Models::AllMpnetBaseV2,
      DiscourseAi::Embeddings::Models::TextEmbeddingAda002,
    ]
    embedding_strategies = [DiscourseAi::Embeddings::Strategies::Truncation]

    # product yields pairs in the same order as a models-outer / strategies-inner loop.
    embedding_models
      .product(embedding_strategies)
      .each do |model, strategy|
        target_table = "ai_topic_embeddings_#{model.id}_#{strategy.id}"
        source_table = "topic_embeddings_#{model.name.underscore}"

        migrate_legacy_table(source_table, target_table)
      end
  end

  # Intentionally irreversible: copied rows are left in place.
  def down
    # no-op
  end

  private

  # Accessor for the legacy embeddings database; looked up on every call,
  # with no caching added here.
  def legacy_embeddings_db
    DiscourseAi::Database::Connection.db
  end

  # Copies every row of +source_table+ (legacy database) into +target_table+,
  # walking the source in ascending topic_id order, 50 rows per round trip.
  #
  # Any PG::Error raised while counting, reading or inserting is logged and
  # swallowed, so a failure on one table does not abort the whole migration.
  def migrate_legacy_table(source_table, target_table)
    total = legacy_embeddings_db.query_single("SELECT COUNT(*) FROM #{source_table}").first
    return unless total > 0

    puts "Migrating #{total} embeddings from #{source_table} to #{target_table}"

    # Keyset pagination: remember the highest topic_id copied so far.
    cursor = 0
    until (rows = fetch_legacy_batch(source_table, cursor)).empty?
      insert_batch(target_table, rows)
      cursor = rows.last.topic_id
    end
  rescue PG::Error => e
    Rails.logger.error(
      "Error #{e} migrating embeddings from #{source_table} to #{target_table}",
    )
  end

  # Reads the next (at most 50) rows whose topic_id is greater than +after_topic_id+.
  def fetch_legacy_batch(source_table, after_topic_id)
    legacy_embeddings_db.query(<<-SQL)
      SELECT topic_id, embedding
      FROM #{source_table}
      WHERE topic_id > #{after_topic_id}
      ORDER BY topic_id ASC
      LIMIT 50
    SQL
  end

  # Inserts one batch with a single multi-row INSERT. Rows whose topic_id
  # already exists in +target_table+ are skipped (ON CONFLICT ... DO NOTHING).
  # Version columns are written as 0 and digest as an empty string.
  def insert_batch(target_table, rows)
    value_tuples =
      rows.map do |row|
        "(#{row.topic_id}, 0, 0, '', '#{row.embedding}', CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)"
      end

    DB.exec(<<-SQL)
      INSERT INTO #{target_table} (topic_id, model_version, strategy_version, digest, embeddings, created_at, updated_at)
      VALUES #{value_tuples.join(", ")}
      ON CONFLICT (topic_id)
      DO NOTHING
    SQL
  end
end
|