From 175def1267f882e49edb4920dd98a68ea6d59ddc Mon Sep 17 00:00:00 2001 From: Roman Rizzi Date: Tue, 5 Sep 2023 14:31:04 -0300 Subject: [PATCH] FIX: Update migrations with latest vector rep changes (#199) --- ...171142_create_ai_topic_embeddings_table.rb | 33 +++++------ ...rate_embeddings_from_dedicated_database.rb | 56 +++++++++---------- ...ate_multilingual_topic_embeddings_table.rb | 26 ++++----- 3 files changed, 53 insertions(+), 62 deletions(-) diff --git a/db/migrate/20230710171142_create_ai_topic_embeddings_table.rb b/db/migrate/20230710171142_create_ai_topic_embeddings_table.rb index 1bc70cd1..50626c87 100644 --- a/db/migrate/20230710171142_create_ai_topic_embeddings_table.rb +++ b/db/migrate/20230710171142_create_ai_topic_embeddings_table.rb @@ -2,26 +2,23 @@ class CreateAiTopicEmbeddingsTable < ActiveRecord::Migration[7.0] def change - models = [ - DiscourseAi::Embeddings::Models::AllMpnetBaseV2, - DiscourseAi::Embeddings::Models::TextEmbeddingAda002, - ] - strategies = [DiscourseAi::Embeddings::Strategies::Truncation] + truncation = DiscourseAi::Embeddings::Strategies::Truncation.new + vector_reps = + [ + DiscourseAi::Embeddings::VectorRepresentations::AllMpnetBaseV2, + DiscourseAi::Embeddings::VectorRepresentations::TextEmbeddingAda002, + ].map { |k| k.new(truncation) } - models.each do |model| - strategies.each do |strategy| - table_name = "ai_topic_embeddings_#{model.id}_#{strategy.id}".to_sym + vector_reps.each do |vector_rep| + create_table vector_rep.table_name.to_sym, id: false do |t| + t.integer :topic_id, null: false + t.integer :model_version, null: false + t.integer :strategy_version, null: false + t.text :digest, null: false + t.column :embeddings, "vector(#{vector_rep.dimensions})", null: false + t.timestamps - create_table table_name, id: false do |t| - t.integer :topic_id, null: false - t.integer :model_version, null: false - t.integer :strategy_version, null: false - t.text :digest, null: false - t.column :embeddings, "vector(#{model.dimensions})", null: false - t.timestamps - - t.index :topic_id, unique: true - end + t.index :topic_id, unique: true end end end diff --git a/db/migrate/20230710171143_migrate_embeddings_from_dedicated_database.rb b/db/migrate/20230710171143_migrate_embeddings_from_dedicated_database.rb index 406e9337..70eaa864 100644 --- a/db/migrate/20230710171143_migrate_embeddings_from_dedicated_database.rb +++ b/db/migrate/20230710171143_migrate_embeddings_from_dedicated_database.rb @@ -5,54 +5,54 @@ class MigrateEmbeddingsFromDedicatedDatabase < ActiveRecord::Migration[7.0] return unless SiteSetting.ai_embeddings_enabled return unless SiteSetting.ai_embeddings_pg_connection_string.present? - models = [ - DiscourseAi::Embeddings::Models::AllMpnetBaseV2, - DiscourseAi::Embeddings::Models::TextEmbeddingAda002, - ] - strategies = [DiscourseAi::Embeddings::Strategies::Truncation] + truncation = DiscourseAi::Embeddings::Strategies::Truncation.new - models.each do |model| - strategies.each do |strategy| - new_table_name = "ai_topic_embeddings_#{model.id}_#{strategy.id}" - old_table_name = "topic_embeddings_#{model.name.underscore}" + vector_reps = + [ + DiscourseAi::Embeddings::VectorRepresentations::AllMpnetBaseV2, + DiscourseAi::Embeddings::VectorRepresentations::TextEmbeddingAda002, + ].map { |k| k.new(truncation) } - begin - row_count = - DiscourseAi::Database::Connection - .db - .query_single("SELECT COUNT(*) FROM #{old_table_name}") - .first + vector_reps.each do |vector_rep| + new_table_name = vector_rep.table_name + old_table_name = "topic_embeddings_#{vector_rep.name.underscore}" - if row_count > 0 - puts "Migrating #{row_count} embeddings from #{old_table_name} to #{new_table_name}" + begin + row_count = + DiscourseAi::Database::Connection + .db + .query_single("SELECT COUNT(*) FROM #{old_table_name}") + .first - last_topic_id = 0 + if row_count > 0 + puts "Migrating #{row_count} embeddings from #{old_table_name} to #{new_table_name}" - loop do - batch = DiscourseAi::Database::Connection.db.query(<<-SQL) + last_topic_id = 0 + + loop do + batch = DiscourseAi::Database::Connection.db.query(<<-SQL) SELECT topic_id, embedding FROM #{old_table_name} WHERE topic_id > #{last_topic_id} ORDER BY topic_id ASC LIMIT 50 SQL - break if batch.empty? + break if batch.empty? - DB.exec(<<-SQL) + DB.exec(<<-SQL) INSERT INTO #{new_table_name} (topic_id, model_version, strategy_version, digest, embeddings, created_at, updated_at) VALUES #{batch.map { |r| "(#{r.topic_id}, 0, 0, '', '#{r.embedding}', CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)" }.join(", ")} ON CONFLICT (topic_id) DO NOTHING SQL - last_topic_id = batch.last.topic_id - end + last_topic_id = batch.last.topic_id end - rescue PG::Error => e - Rails.logger.error( - "Error #{e} migrating embeddings from #{old_table_name} to #{new_table_name}", - ) end + rescue PG::Error => e + Rails.logger.error( + "Error #{e} migrating embeddings from #{old_table_name} to #{new_table_name}", + ) end end end diff --git a/db/migrate/20230727170222_create_multilingual_topic_embeddings_table.rb b/db/migrate/20230727170222_create_multilingual_topic_embeddings_table.rb index 92fae68f..1ca179ee 100644 --- a/db/migrate/20230727170222_create_multilingual_topic_embeddings_table.rb +++ b/db/migrate/20230727170222_create_multilingual_topic_embeddings_table.rb @@ -2,24 +2,18 @@ class CreateMultilingualTopicEmbeddingsTable < ActiveRecord::Migration[7.0] def change - models = [DiscourseAi::Embeddings::Models::MultilingualE5Large] - strategies = [DiscourseAi::Embeddings::Strategies::Truncation] + truncation = DiscourseAi::Embeddings::Strategies::Truncation.new + vector_rep = DiscourseAi::Embeddings::VectorRepresentations::MultilingualE5Large.new(truncation) - models.each do |model| - strategies.each do |strategy| - table_name = "ai_topic_embeddings_#{model.id}_#{strategy.id}".to_sym + create_table vector_rep.table_name.to_sym, id: false do |t| + t.integer :topic_id, null: false + t.integer :model_version, null: false + t.integer :strategy_version, null: false + t.text :digest, null: false + t.column :embeddings, "vector(#{vector_rep.dimensions})", null: false + t.timestamps - create_table table_name, id: false do |t| - t.integer :topic_id, null: false - t.integer :model_version, null: false - t.integer :strategy_version, null: false - t.text :digest, null: false - t.column :embeddings, "vector(#{model.dimensions})", null: false - t.timestamps - - t.index :topic_id, unique: true - end - end + t.index :topic_id, unique: true end end end