FIX: More resilient sentiment backfill query (#998)

This commit is contained in:
Rafael dos Santos Silva 2024-12-04 12:10:31 -03:00 committed by GitHub
parent e7c2cd861a
commit 938d4c018c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed file with 33 additions and 33 deletions

View File

@@ -5,43 +5,43 @@ module DiscourseAi
class PostClassification class PostClassification
def self.backfill_query(from_post_id: nil, max_age_days: nil) def self.backfill_query(from_post_id: nil, max_age_days: nil)
available_classifier_names = available_classifier_names =
DiscourseAi::Sentiment::SentimentSiteSettingJsonSchema DiscourseAi::Sentiment::SentimentSiteSettingJsonSchema.values.map { _1.model_name }
.values
.map { |mc| mc.model_name.downcase }
.sort
base_query = queries =
Post available_classifier_names.map do |classifier_name|
.includes(:sentiment_classifications) base_query =
.joins("INNER JOIN topics ON topics.id = posts.topic_id") Post
.where(post_type: Post.types[:regular]) .includes(:sentiment_classifications)
.where.not(topics: { archetype: Archetype.private_message }) .joins("INNER JOIN topics ON topics.id = posts.topic_id")
.where(posts: { deleted_at: nil }) .where(post_type: Post.types[:regular])
.where(topics: { deleted_at: nil }) .where.not(topics: { archetype: Archetype.private_message })
.joins(<<~SQL) .where(posts: { deleted_at: nil })
LEFT JOIN classification_results crs .where(topics: { deleted_at: nil })
ON crs.target_id = posts.id .joins(<<~SQL)
AND crs.target_type = 'Post' LEFT JOIN classification_results crs
AND crs.classification_type = 'sentiment' ON crs.target_id = posts.id
SQL AND crs.target_type = 'Post'
.group("posts.id") AND crs.classification_type = 'sentiment'
.having(<<~SQL, available_classifier_names) AND crs.model_used = '#{classifier_name}'
COUNT(crs.model_used) = 0 SQL
OR array_agg( .where("crs.id IS NULL")
DISTINCT LOWER(crs.model_used) ORDER BY LOWER(crs.model_used)
)::text[] IS DISTINCT FROM array[?]
SQL
base_query = base_query.where("posts.id >= ?", from_post_id.to_i) if from_post_id.present? base_query =
base_query.where("posts.id >= ?", from_post_id.to_i) if from_post_id.present?
if max_age_days.present? if max_age_days.present?
base_query = base_query =
base_query.where( base_query.where(
"posts.created_at > current_date - INTERVAL '#{max_age_days.to_i} DAY'", "posts.created_at > current_date - INTERVAL '#{max_age_days.to_i} DAY'",
) )
end end
base_query base_query
end
unioned_queries = queries.map(&:to_sql).join(" UNION ")
Post.from(Arel.sql("(#{unioned_queries}) as posts"))
end end
def bulk_classify!(relation) def bulk_classify!(relation)