PERF: Optimize backfill query to prevent statement timeouts (#1066)

This commit is contained in:
Roman Rizzi 2025-01-14 15:39:19 -03:00 committed by GitHub
parent 6721c6751d
commit c4d2b7de1d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 21 additions and 17 deletions

View File

@@ -6,16 +6,18 @@ class BackfillTopicEmbeddings < ActiveRecord::Migration[7.2]
loop do loop do
count = execute(<<~SQL).cmd_tuples count = execute(<<~SQL).cmd_tuples
INSERT INTO ai_topics_embeddings (topic_id, model_id, model_version, strategy_id, strategy_version, digest, embeddings, created_at, updated_at) INSERT INTO ai_topics_embeddings (topic_id, model_id, model_version, strategy_id, strategy_version, digest, embeddings, created_at, updated_at)
SELECT source.* SELECT source.*
FROM ai_topic_embeddings source FROM (
WHERE NOT EXISTS ( SELECT old_table.*
SELECT 1 FROM ai_topic_embeddings old_table
FROM ai_topics_embeddings target LEFT JOIN ai_topics_embeddings target ON (
WHERE target.model_id = source.model_id target.model_id = old_table.model_id AND
AND target.strategy_id = source.strategy_id target.strategy_id = old_table.strategy_id AND
AND target.topic_id = source.topic_id target.topic_id = old_table.topic_id
) )
WHERE target.topic_id IS NULL
LIMIT 10000 LIMIT 10000
) source
SQL SQL
break if count == 0 break if count == 0

View File

@@ -7,17 +7,19 @@ class BackfillPostEmbeddings < ActiveRecord::Migration[7.2]
loop do loop do
count = execute(<<~SQL).cmd_tuples count = execute(<<~SQL).cmd_tuples
INSERT INTO ai_posts_embeddings (post_id, model_id, model_version, strategy_id, strategy_version, digest, embeddings, created_at, updated_at) INSERT INTO ai_posts_embeddings (post_id, model_id, model_version, strategy_id, strategy_version, digest, embeddings, created_at, updated_at)
SELECT source.* SELECT source.*
FROM ai_post_embeddings source FROM (
WHERE NOT EXISTS ( SELECT old_table.*
SELECT 1 FROM ai_post_embeddings old_table
FROM ai_posts_embeddings target LEFT JOIN ai_posts_embeddings target ON (
WHERE target.model_id = source.model_id target.model_id = old_table.model_id AND
AND target.strategy_id = source.strategy_id target.strategy_id = old_table.strategy_id AND
AND target.post_id = source.post_id target.post_id = old_table.post_id
) )
WHERE target.post_id IS NULL
LIMIT 10000 LIMIT 10000
) source
SQL SQL
break if count == 0 break if count == 0