diff --git a/app/jobs/scheduled/embeddings_backfill.rb b/app/jobs/scheduled/embeddings_backfill.rb index a42eae4a..28b7c25a 100644 --- a/app/jobs/scheduled/embeddings_backfill.rb +++ b/app/jobs/scheduled/embeddings_backfill.rb @@ -54,9 +54,12 @@ module Jobs # Finally, we'll try to backfill embeddings for topics that have outdated # embeddings due to edits or new replies. Here we only do 10% of the limit relation = - topics.where("#{table_name}.updated_at < ?", 7.days.ago).limit((limit - rebaked) / 10) + topics + .where("#{table_name}.updated_at < ?", 6.hours.ago) + .where("#{table_name}.updated_at < topics.updated_at") + .limit((limit - rebaked) / 10) - populate_topic_embeddings(vector_rep, relation) + populate_topic_embeddings(vector_rep, relation, force: true) return if rebaked >= limit @@ -115,9 +118,12 @@ module Jobs private - def populate_topic_embeddings(vector_rep, topics) + def populate_topic_embeddings(vector_rep, topics, force: false) done = 0 - ids = topics.where("#{vector_rep.topic_table_name}.topic_id IS NULL").pluck("topics.id") + + topics = topics.where("#{vector_rep.topic_table_name}.topic_id IS NULL") if !force + + ids = topics.pluck("topics.id") ids.each do |id| topic = Topic.find_by(id: id) diff --git a/lib/embeddings/semantic_search.rb b/lib/embeddings/semantic_search.rb index 779f644a..02d8dd11 100644 --- a/lib/embeddings/semantic_search.rb +++ b/lib/embeddings/semantic_search.rb @@ -67,6 +67,11 @@ module DiscourseAi .fetch(embedding_key, expires_in: 1.week) { vector_rep.vector_from(search_term) } end + # this ensures the candidate topics are over selected + # that way we have a much better chance of finding topics + # if the user filtered the results or index is a bit out of date + OVER_SELECTION_FACTOR = 4 + def search_for_topics(query, page = 1, hyde: true) max_results_per_page = 100 limit = [Search.per_filter, max_results_per_page].min + 1 @@ -78,10 +83,12 @@ module DiscourseAi search_embedding = hyde ? hyde_embedding(search_term) : embedding(search_term) + over_selection_limit = limit * OVER_SELECTION_FACTOR + candidate_topic_ids = vector_rep.asymmetric_topics_similarity_search( search_embedding, - limit: limit, + limit: over_selection_limit, offset: offset, ) @@ -92,6 +99,7 @@ module DiscourseAi .where("topics.visible") .where(topic_id: candidate_topic_ids, post_number: 1) .order("array_position(ARRAY#{candidate_topic_ids}, posts.topic_id)") + .limit(limit) query_filter_results = search.apply_filters(semantic_results) diff --git a/lib/embeddings/vector_representations/base.rb b/lib/embeddings/vector_representations/base.rb index f55b341c..5febc5f2 100644 --- a/lib/embeddings/vector_representations/base.rb +++ b/lib/embeddings/vector_representations/base.rb @@ -425,14 +425,14 @@ module DiscourseAi DB.exec( <<~SQL, INSERT INTO #{topic_table_name} (topic_id, model_id, model_version, strategy_id, strategy_version, digest, embeddings, created_at, updated_at) - VALUES (:topic_id, :model_id, :model_version, :strategy_id, :strategy_version, :digest, '[:embeddings]', CURRENT_TIMESTAMP, CURRENT_TIMESTAMP) + VALUES (:topic_id, :model_id, :model_version, :strategy_id, :strategy_version, :digest, '[:embeddings]', :now, :now) ON CONFLICT (strategy_id, model_id, topic_id) DO UPDATE SET model_version = :model_version, strategy_version = :strategy_version, digest = :digest, embeddings = '[:embeddings]', - updated_at = CURRENT_TIMESTAMP + updated_at = :now SQL topic_id: target.id, model_id: id, @@ -441,19 +441,20 @@ module DiscourseAi strategy_version: @strategy.version, digest: digest, embeddings: vector, + now: Time.zone.now, ) elsif target.is_a?(Post) DB.exec( <<~SQL, INSERT INTO #{post_table_name} (post_id, model_id, model_version, strategy_id, strategy_version, digest, embeddings, created_at, updated_at) - VALUES (:post_id, :model_id, :model_version, :strategy_id, :strategy_version, :digest, '[:embeddings]', CURRENT_TIMESTAMP, CURRENT_TIMESTAMP) + VALUES (:post_id, :model_id, :model_version, :strategy_id, :strategy_version, :digest, '[:embeddings]', :now, :now) ON CONFLICT (model_id, strategy_id, post_id) DO UPDATE SET model_version = :model_version, strategy_version = :strategy_version, digest = :digest, embeddings = '[:embeddings]', - updated_at = CURRENT_TIMESTAMP + updated_at = :now SQL post_id: target.id, model_id: id, @@ -462,19 +463,20 @@ module DiscourseAi strategy_version: @strategy.version, digest: digest, embeddings: vector, + now: Time.zone.now, ) elsif target.is_a?(RagDocumentFragment) DB.exec( <<~SQL, INSERT INTO #{rag_fragments_table_name} (rag_document_fragment_id, model_id, model_version, strategy_id, strategy_version, digest, embeddings, created_at, updated_at) - VALUES (:fragment_id, :model_id, :model_version, :strategy_id, :strategy_version, :digest, '[:embeddings]', CURRENT_TIMESTAMP, CURRENT_TIMESTAMP) + VALUES (:fragment_id, :model_id, :model_version, :strategy_id, :strategy_version, :digest, '[:embeddings]', :now, :now) ON CONFLICT (model_id, strategy_id, rag_document_fragment_id) DO UPDATE SET model_version = :model_version, strategy_version = :strategy_version, digest = :digest, embeddings = '[:embeddings]', - updated_at = CURRENT_TIMESTAMP + updated_at = :now SQL fragment_id: target.id, model_id: id, @@ -483,6 +485,7 @@ module DiscourseAi strategy_version: @strategy.version, digest: digest, embeddings: vector, + now: Time.zone.now, ) else raise ArgumentError, "Invalid target type" diff --git a/spec/jobs/scheduled/embeddings_backfill_spec.rb b/spec/jobs/scheduled/embeddings_backfill_spec.rb index 74cec6dc..88cd8976 100644 --- a/spec/jobs/scheduled/embeddings_backfill_spec.rb +++ b/spec/jobs/scheduled/embeddings_backfill_spec.rb @@ -24,13 +24,14 @@ RSpec.describe Jobs::EmbeddingsBackfill do DiscourseAi::Embeddings::VectorRepresentations::Base.current_representation(strategy) end - it "backfills topics based on bumped_at date" do + before do SiteSetting.ai_embeddings_enabled = true SiteSetting.ai_embeddings_discourse_service_api_endpoint = "http://test.com" SiteSetting.ai_embeddings_backfill_batch_size = 1 - Jobs.run_immediately! + end + it "backfills topics based on bumped_at date" do embedding = Array.new(1024) { 1 } WebMock.stub_request( @@ -51,5 +52,20 @@ RSpec.describe Jobs::EmbeddingsBackfill do topic_ids = DB.query_single("SELECT topic_id from #{vector_rep.topic_table_name}") expect(topic_ids).to contain_exactly(first_topic.id, second_topic.id, third_topic.id) + + freeze_time 1.day.from_now + + # new title forces a reindex + third_topic.update!(updated_at: Time.zone.now, title: "new title - 123") + + Jobs::EmbeddingsBackfill.new.execute({}) + + index_date = + DB.query_single( + "SELECT updated_at from #{vector_rep.topic_table_name} WHERE topic_id = ?", + third_topic.id, + ).first + + expect(index_date).to be_within_one_second_of(Time.zone.now) end end