FIX: Update migrations with latest vector rep changes (#199)

This commit is contained in:
Roman Rizzi 2023-09-05 14:31:04 -03:00 committed by GitHub
parent ee734a340a
commit 175def1267
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 53 additions and 62 deletions

View File

@@ -2,26 +2,23 @@
class CreateAiTopicEmbeddingsTable < ActiveRecord::Migration[7.0] class CreateAiTopicEmbeddingsTable < ActiveRecord::Migration[7.0]
def change def change
models = [ truncation = DiscourseAi::Embeddings::Strategies::Truncation.new
DiscourseAi::Embeddings::Models::AllMpnetBaseV2, vector_reps =
DiscourseAi::Embeddings::Models::TextEmbeddingAda002, [
] DiscourseAi::Embeddings::VectorRepresentations::AllMpnetBaseV2,
strategies = [DiscourseAi::Embeddings::Strategies::Truncation] DiscourseAi::Embeddings::VectorRepresentations::TextEmbeddingAda002,
].map { |k| k.new(truncation) }
models.each do |model| vector_reps.each do |vector_rep|
strategies.each do |strategy| create_table vector_rep.table_name.to_sym, id: false do |t|
table_name = "ai_topic_embeddings_#{model.id}_#{strategy.id}".to_sym t.integer :topic_id, null: false
t.integer :model_version, null: false
t.integer :strategy_version, null: false
t.text :digest, null: false
t.column :embeddings, "vector(#{vector_rep.dimensions})", null: false
t.timestamps
create_table table_name, id: false do |t| t.index :topic_id, unique: true
t.integer :topic_id, null: false
t.integer :model_version, null: false
t.integer :strategy_version, null: false
t.text :digest, null: false
t.column :embeddings, "vector(#{model.dimensions})", null: false
t.timestamps
t.index :topic_id, unique: true
end
end end
end end
end end

View File

@@ -5,54 +5,54 @@ class MigrateEmbeddingsFromDedicatedDatabase < ActiveRecord::Migration[7.0]
return unless SiteSetting.ai_embeddings_enabled return unless SiteSetting.ai_embeddings_enabled
return unless SiteSetting.ai_embeddings_pg_connection_string.present? return unless SiteSetting.ai_embeddings_pg_connection_string.present?
models = [ truncation = DiscourseAi::Embeddings::Strategies::Truncation.new
DiscourseAi::Embeddings::Models::AllMpnetBaseV2,
DiscourseAi::Embeddings::Models::TextEmbeddingAda002,
]
strategies = [DiscourseAi::Embeddings::Strategies::Truncation]
models.each do |model| vector_reps =
strategies.each do |strategy| [
new_table_name = "ai_topic_embeddings_#{model.id}_#{strategy.id}" DiscourseAi::Embeddings::VectorRepresentations::AllMpnetBaseV2,
old_table_name = "topic_embeddings_#{model.name.underscore}" DiscourseAi::Embeddings::VectorRepresentations::TextEmbeddingAda002,
].map { |k| k.new(truncation) }
begin vector_reps.each do |vector_rep|
row_count = new_table_name = vector_rep.table_name
DiscourseAi::Database::Connection old_table_name = "topic_embeddings_#{vector_rep.name.underscore}"
.db
.query_single("SELECT COUNT(*) FROM #{old_table_name}")
.first
if row_count > 0 begin
puts "Migrating #{row_count} embeddings from #{old_table_name} to #{new_table_name}" row_count =
DiscourseAi::Database::Connection
.db
.query_single("SELECT COUNT(*) FROM #{old_table_name}")
.first
last_topic_id = 0 if row_count > 0
puts "Migrating #{row_count} embeddings from #{old_table_name} to #{new_table_name}"
loop do last_topic_id = 0
batch = DiscourseAi::Database::Connection.db.query(<<-SQL)
loop do
batch = DiscourseAi::Database::Connection.db.query(<<-SQL)
SELECT topic_id, embedding SELECT topic_id, embedding
FROM #{old_table_name} FROM #{old_table_name}
WHERE topic_id > #{last_topic_id} WHERE topic_id > #{last_topic_id}
ORDER BY topic_id ASC ORDER BY topic_id ASC
LIMIT 50 LIMIT 50
SQL SQL
break if batch.empty? break if batch.empty?
DB.exec(<<-SQL) DB.exec(<<-SQL)
INSERT INTO #{new_table_name} (topic_id, model_version, strategy_version, digest, embeddings, created_at, updated_at) INSERT INTO #{new_table_name} (topic_id, model_version, strategy_version, digest, embeddings, created_at, updated_at)
VALUES #{batch.map { |r| "(#{r.topic_id}, 0, 0, '', '#{r.embedding}', CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)" }.join(", ")} VALUES #{batch.map { |r| "(#{r.topic_id}, 0, 0, '', '#{r.embedding}', CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)" }.join(", ")}
ON CONFLICT (topic_id) ON CONFLICT (topic_id)
DO NOTHING DO NOTHING
SQL SQL
last_topic_id = batch.last.topic_id last_topic_id = batch.last.topic_id
end
end end
rescue PG::Error => e
Rails.logger.error(
"Error #{e} migrating embeddings from #{old_table_name} to #{new_table_name}",
)
end end
rescue PG::Error => e
Rails.logger.error(
"Error #{e} migrating embeddings from #{old_table_name} to #{new_table_name}",
)
end end
end end
end end

View File

@@ -2,24 +2,18 @@
class CreateMultilingualTopicEmbeddingsTable < ActiveRecord::Migration[7.0] class CreateMultilingualTopicEmbeddingsTable < ActiveRecord::Migration[7.0]
def change def change
models = [DiscourseAi::Embeddings::Models::MultilingualE5Large] truncation = DiscourseAi::Embeddings::Strategies::Truncation.new
strategies = [DiscourseAi::Embeddings::Strategies::Truncation] vector_rep = DiscourseAi::Embeddings::VectorRepresentations::MultilingualE5Large.new(truncation)
models.each do |model| create_table vector_rep.table_name.to_sym, id: false do |t|
strategies.each do |strategy| t.integer :topic_id, null: false
table_name = "ai_topic_embeddings_#{model.id}_#{strategy.id}".to_sym t.integer :model_version, null: false
t.integer :strategy_version, null: false
t.text :digest, null: false
t.column :embeddings, "vector(#{vector_rep.dimensions})", null: false
t.timestamps
create_table table_name, id: false do |t| t.index :topic_id, unique: true
t.integer :topic_id, null: false
t.integer :model_version, null: false
t.integer :strategy_version, null: false
t.text :digest, null: false
t.column :embeddings, "vector(#{model.dimensions})", null: false
t.timestamps
t.index :topic_id, unique: true
end
end
end end
end end
end end