FIX: Skip records without content to classify (#960)

This commit is contained in:
Roman Rizzi 2024-11-26 15:54:20 -03:00 committed by GitHub
parent ddf2bf7034
commit ef07fcb308
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 24 additions and 14 deletions

View File

@ -70,6 +70,7 @@ module Jobs
Post
.joins("LEFT JOIN #{table_name} ON #{table_name}.post_id = posts.id")
.where(deleted_at: nil)
.where(post_type: Post.types[:regular])
.limit(limit - rebaked)
# First, we'll try to backfill embeddings for posts that have none

View File

@ -61,18 +61,21 @@ module DiscourseAi
embedding_gen = inference_client
promised_embeddings =
relation.map do |record|
materials = { target: record, text: prepare_text(record) }
relation
.map do |record|
prepared_text = prepare_text(record)
next if prepared_text.blank?
Concurrent::Promises
.fulfilled_future(materials, pool)
.then_on(pool) do |w_prepared_text|
w_prepared_text.merge(
embedding: embedding_gen.perform!(w_prepared_text[:text]),
digest: OpenSSL::Digest::SHA1.hexdigest(w_prepared_text[:text]),
)
end
end
Concurrent::Promises
.fulfilled_future({ target: record, text: prepared_text }, pool)
.then_on(pool) do |w_prepared_text|
w_prepared_text.merge(
embedding: embedding_gen.perform!(w_prepared_text[:text]),
digest: OpenSSL::Digest::SHA1.hexdigest(w_prepared_text[:text]),
)
end
end
.compact
Concurrent::Promises
.zip(*promised_embeddings)

View File

@ -81,11 +81,13 @@ module DiscourseAi
end
# Builds the text to embed for `record` by delegating to the parent
# implementation through `super`. When the inference client's class name
# includes "DiscourseClassifier", the text is returned with a "query: " prefix.
#
# NOTE(review): this listing is a diff rendered without +/- markers, so the
# pre-change and post-change lines of the method appear interleaved below.
# The post-change lines call `super` once, only add the prefix when the
# prepared text is present, and otherwise return the text unchanged.
def prepare_text(record)
if inference_client.class.name.include?("DiscourseClassifier")
return "query: #{super(record)}"
prepared_text = super(record)
if prepared_text.present? && inference_client.class.name.include?("DiscourseClassifier")
return "query: #{prepared_text}"
end
super(record)
prepared_text
end
end
end

View File

@ -79,6 +79,10 @@ RSpec.shared_examples "generates and store embedding using with vector represent
expect(vector_rep.topic_id_from_representation(expected_embedding_1)).to eq(topic.id)
expect(vector_rep.topic_id_from_representation(expected_embedding_1)).to eq(topic.id)
end
# Regression check for records without content: passes a bare `Topic.new`
# to the bulk generator. The example only requires that the call does not
# raise; it makes no assertion about what is (or is not) stored.
it "does nothing if passed record has no content" do
expect { vector_rep.gen_bulk_reprensentations([Topic.new]) }.not_to raise_error
end
end
describe "#asymmetric_topics_similarity_search" do