From 584753cf608222cbddbb6d55d3d7010a79549695 Mon Sep 17 00:00:00 2001 From: Sam Date: Fri, 30 Aug 2024 14:37:55 +1000 Subject: [PATCH] FIX: we were never reindexing old content (#786) * FIX: we were never reindexing old content Embedding backfill contains logic for searching for old content change and then backfilling. Unfortunately it was excluding all topics that had embedding unconditionally, leading to no backfill ever happening. This change adds a test and ensures we backfill. * over select results, this ensures we will be more likely to find ai results when filtered --- app/jobs/scheduled/embeddings_backfill.rb | 14 +++++++++---- lib/embeddings/semantic_search.rb | 10 +++++++++- lib/embeddings/vector_representations/base.rb | 15 ++++++++------ .../scheduled/embeddings_backfill_spec.rb | 20 +++++++++++++++++-- 4 files changed, 46 insertions(+), 13 deletions(-) diff --git a/app/jobs/scheduled/embeddings_backfill.rb b/app/jobs/scheduled/embeddings_backfill.rb index a42eae4a..28b7c25a 100644 --- a/app/jobs/scheduled/embeddings_backfill.rb +++ b/app/jobs/scheduled/embeddings_backfill.rb @@ -54,9 +54,12 @@ module Jobs # Finally, we'll try to backfill embeddings for topics that have outdated # embeddings due to edits or new replies. Here we only do 10% of the limit relation = - topics.where("#{table_name}.updated_at < ?", 7.days.ago).limit((limit - rebaked) / 10) + topics + .where("#{table_name}.updated_at < ?", 6.hours.ago) + .where("#{table_name}.updated_at < topics.updated_at") + .limit((limit - rebaked) / 10) - populate_topic_embeddings(vector_rep, relation) + populate_topic_embeddings(vector_rep, relation, force: true) return if rebaked >= limit @@ -115,9 +118,12 @@ module Jobs private - def populate_topic_embeddings(vector_rep, topics) + def populate_topic_embeddings(vector_rep, topics, force: false) done = 0 - ids = topics.where("#{vector_rep.topic_table_name}.topic_id IS NULL").pluck("topics.id") + + topics = topics.where("#{vector_rep.topic_table_name}.topic_id IS NULL") if !force + + ids = topics.pluck("topics.id") ids.each do |id| topic = Topic.find_by(id: id) diff --git a/lib/embeddings/semantic_search.rb b/lib/embeddings/semantic_search.rb index 779f644a..02d8dd11 100644 --- a/lib/embeddings/semantic_search.rb +++ b/lib/embeddings/semantic_search.rb @@ -67,6 +67,11 @@ module DiscourseAi .fetch(embedding_key, expires_in: 1.week) { vector_rep.vector_from(search_term) } end + # this ensures the candidate topics are over selected + # that way we have a much better chance of finding topics + # if the user filtered the results or index is a bit out of date + OVER_SELECTION_FACTOR = 4 + def search_for_topics(query, page = 1, hyde: true) max_results_per_page = 100 limit = [Search.per_filter, max_results_per_page].min + 1 @@ -78,10 +83,12 @@ module DiscourseAi search_embedding = hyde ? hyde_embedding(search_term) : embedding(search_term) + over_selection_limit = limit * OVER_SELECTION_FACTOR + candidate_topic_ids = vector_rep.asymmetric_topics_similarity_search( search_embedding, - limit: limit, + limit: over_selection_limit, offset: offset, ) @@ -92,6 +99,7 @@ module DiscourseAi .where("topics.visible") .where(topic_id: candidate_topic_ids, post_number: 1) .order("array_position(ARRAY#{candidate_topic_ids}, posts.topic_id)") + .limit(limit) query_filter_results = search.apply_filters(semantic_results) diff --git a/lib/embeddings/vector_representations/base.rb b/lib/embeddings/vector_representations/base.rb index f55b341c..5febc5f2 100644 --- a/lib/embeddings/vector_representations/base.rb +++ b/lib/embeddings/vector_representations/base.rb @@ -425,14 +425,14 @@ module DiscourseAi DB.exec( <<~SQL, INSERT INTO #{topic_table_name} (topic_id, model_id, model_version, strategy_id, strategy_version, digest, embeddings, created_at, updated_at) - VALUES (:topic_id, :model_id, :model_version, :strategy_id, :strategy_version, :digest, '[:embeddings]', CURRENT_TIMESTAMP, CURRENT_TIMESTAMP) + VALUES (:topic_id, :model_id, :model_version, :strategy_id, :strategy_version, :digest, '[:embeddings]', :now, :now) ON CONFLICT (strategy_id, model_id, topic_id) DO UPDATE SET model_version = :model_version, strategy_version = :strategy_version, digest = :digest, embeddings = '[:embeddings]', - updated_at = CURRENT_TIMESTAMP + updated_at = :now SQL topic_id: target.id, model_id: id, @@ -441,19 +441,20 @@ module DiscourseAi strategy_version: @strategy.version, digest: digest, embeddings: vector, + now: Time.zone.now, ) elsif target.is_a?(Post) DB.exec( <<~SQL, INSERT INTO #{post_table_name} (post_id, model_id, model_version, strategy_id, strategy_version, digest, embeddings, created_at, updated_at) - VALUES (:post_id, :model_id, :model_version, :strategy_id, :strategy_version, :digest, '[:embeddings]', CURRENT_TIMESTAMP, CURRENT_TIMESTAMP) + VALUES (:post_id, :model_id, :model_version, :strategy_id, :strategy_version, :digest, '[:embeddings]', :now, :now) ON CONFLICT (model_id, strategy_id, post_id) DO UPDATE SET model_version = :model_version, strategy_version = :strategy_version, digest = :digest, embeddings = '[:embeddings]', - updated_at = CURRENT_TIMESTAMP + updated_at = :now SQL post_id: target.id, model_id: id, @@ -462,19 +463,20 @@ module DiscourseAi strategy_version: @strategy.version, digest: digest, embeddings: vector, + now: Time.zone.now, ) elsif target.is_a?(RagDocumentFragment) DB.exec( <<~SQL, INSERT INTO #{rag_fragments_table_name} (rag_document_fragment_id, model_id, model_version, strategy_id, strategy_version, digest, embeddings, created_at, updated_at) - VALUES (:fragment_id, :model_id, :model_version, :strategy_id, :strategy_version, :digest, '[:embeddings]', CURRENT_TIMESTAMP, CURRENT_TIMESTAMP) + VALUES (:fragment_id, :model_id, :model_version, :strategy_id, :strategy_version, :digest, '[:embeddings]', :now, :now) ON CONFLICT (model_id, strategy_id, rag_document_fragment_id) DO UPDATE SET model_version = :model_version, strategy_version = :strategy_version, digest = :digest, embeddings = '[:embeddings]', - updated_at = CURRENT_TIMESTAMP + updated_at = :now SQL fragment_id: target.id, model_id: id, @@ -483,6 +485,7 @@ module DiscourseAi strategy_version: @strategy.version, digest: digest, embeddings: vector, + now: Time.zone.now, ) else raise ArgumentError, "Invalid target type" diff --git a/spec/jobs/scheduled/embeddings_backfill_spec.rb b/spec/jobs/scheduled/embeddings_backfill_spec.rb index 74cec6dc..88cd8976 100644 --- a/spec/jobs/scheduled/embeddings_backfill_spec.rb +++ b/spec/jobs/scheduled/embeddings_backfill_spec.rb @@ -24,13 +24,14 @@ RSpec.describe Jobs::EmbeddingsBackfill do DiscourseAi::Embeddings::VectorRepresentations::Base.current_representation(strategy) end - it "backfills topics based on bumped_at date" do + before do SiteSetting.ai_embeddings_enabled = true SiteSetting.ai_embeddings_discourse_service_api_endpoint = "http://test.com" SiteSetting.ai_embeddings_backfill_batch_size = 1 - Jobs.run_immediately! + end + it "backfills topics based on bumped_at date" do embedding = Array.new(1024) { 1 } WebMock.stub_request( @@ -51,5 +52,20 @@ RSpec.describe Jobs::EmbeddingsBackfill do topic_ids = DB.query_single("SELECT topic_id from #{vector_rep.topic_table_name}") expect(topic_ids).to contain_exactly(first_topic.id, second_topic.id, third_topic.id) + + freeze_time 1.day.from_now + + # new title forces a reindex + third_topic.update!(updated_at: Time.zone.now, title: "new title - 123") + + Jobs::EmbeddingsBackfill.new.execute({}) + + index_date = + DB.query_single( + "SELECT updated_at from #{vector_rep.topic_table_name} WHERE topic_id = ?", + third_topic.id, + ).first + + expect(index_date).to be_within_one_second_of(Time.zone.now) end end