FIX: we were never reindexing old content (#786)

* FIX: we were never reindexing old content

Embedding backfill contains logic for detecting changes to old content
and then backfilling the corresponding embeddings.

Unfortunately, it was unconditionally excluding every topic that already
had an embedding, so the backfill never actually happened.


This change adds a test and ensures we backfill.

* Over-select candidate results; this makes it much more likely we still
find AI results after the user's filters are applied
This commit is contained in:
Sam 2024-08-30 14:37:55 +10:00 committed by GitHub
parent eee8e72756
commit 584753cf60
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 46 additions and 13 deletions

View File

@ -54,9 +54,12 @@ module Jobs
# Finally, we'll try to backfill embeddings for topics that have outdated
# embeddings due to edits or new replies. Here we only do 10% of the limit
relation =
topics.where("#{table_name}.updated_at < ?", 7.days.ago).limit((limit - rebaked) / 10)
topics
.where("#{table_name}.updated_at < ?", 6.hours.ago)
.where("#{table_name}.updated_at < topics.updated_at")
.limit((limit - rebaked) / 10)
populate_topic_embeddings(vector_rep, relation)
populate_topic_embeddings(vector_rep, relation, force: true)
return if rebaked >= limit
@ -115,9 +118,12 @@ module Jobs
private
def populate_topic_embeddings(vector_rep, topics)
def populate_topic_embeddings(vector_rep, topics, force: false)
done = 0
ids = topics.where("#{vector_rep.topic_table_name}.topic_id IS NULL").pluck("topics.id")
topics = topics.where("#{vector_rep.topic_table_name}.topic_id IS NULL") if !force
ids = topics.pluck("topics.id")
ids.each do |id|
topic = Topic.find_by(id: id)

View File

@ -67,6 +67,11 @@ module DiscourseAi
.fetch(embedding_key, expires_in: 1.week) { vector_rep.vector_from(search_term) }
end
# this ensures the candidate topics are over selected
# that way we have a much better chance of finding topics
# if the user filtered the results or index is a bit out of date
OVER_SELECTION_FACTOR = 4
def search_for_topics(query, page = 1, hyde: true)
max_results_per_page = 100
limit = [Search.per_filter, max_results_per_page].min + 1
@ -78,10 +83,12 @@ module DiscourseAi
search_embedding = hyde ? hyde_embedding(search_term) : embedding(search_term)
over_selection_limit = limit * OVER_SELECTION_FACTOR
candidate_topic_ids =
vector_rep.asymmetric_topics_similarity_search(
search_embedding,
limit: limit,
limit: over_selection_limit,
offset: offset,
)
@ -92,6 +99,7 @@ module DiscourseAi
.where("topics.visible")
.where(topic_id: candidate_topic_ids, post_number: 1)
.order("array_position(ARRAY#{candidate_topic_ids}, posts.topic_id)")
.limit(limit)
query_filter_results = search.apply_filters(semantic_results)

View File

@ -425,14 +425,14 @@ module DiscourseAi
DB.exec(
<<~SQL,
INSERT INTO #{topic_table_name} (topic_id, model_id, model_version, strategy_id, strategy_version, digest, embeddings, created_at, updated_at)
VALUES (:topic_id, :model_id, :model_version, :strategy_id, :strategy_version, :digest, '[:embeddings]', CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
VALUES (:topic_id, :model_id, :model_version, :strategy_id, :strategy_version, :digest, '[:embeddings]', :now, :now)
ON CONFLICT (strategy_id, model_id, topic_id)
DO UPDATE SET
model_version = :model_version,
strategy_version = :strategy_version,
digest = :digest,
embeddings = '[:embeddings]',
updated_at = CURRENT_TIMESTAMP
updated_at = :now
SQL
topic_id: target.id,
model_id: id,
@ -441,19 +441,20 @@ module DiscourseAi
strategy_version: @strategy.version,
digest: digest,
embeddings: vector,
now: Time.zone.now,
)
elsif target.is_a?(Post)
DB.exec(
<<~SQL,
INSERT INTO #{post_table_name} (post_id, model_id, model_version, strategy_id, strategy_version, digest, embeddings, created_at, updated_at)
VALUES (:post_id, :model_id, :model_version, :strategy_id, :strategy_version, :digest, '[:embeddings]', CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
VALUES (:post_id, :model_id, :model_version, :strategy_id, :strategy_version, :digest, '[:embeddings]', :now, :now)
ON CONFLICT (model_id, strategy_id, post_id)
DO UPDATE SET
model_version = :model_version,
strategy_version = :strategy_version,
digest = :digest,
embeddings = '[:embeddings]',
updated_at = CURRENT_TIMESTAMP
updated_at = :now
SQL
post_id: target.id,
model_id: id,
@ -462,19 +463,20 @@ module DiscourseAi
strategy_version: @strategy.version,
digest: digest,
embeddings: vector,
now: Time.zone.now,
)
elsif target.is_a?(RagDocumentFragment)
DB.exec(
<<~SQL,
INSERT INTO #{rag_fragments_table_name} (rag_document_fragment_id, model_id, model_version, strategy_id, strategy_version, digest, embeddings, created_at, updated_at)
VALUES (:fragment_id, :model_id, :model_version, :strategy_id, :strategy_version, :digest, '[:embeddings]', CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
VALUES (:fragment_id, :model_id, :model_version, :strategy_id, :strategy_version, :digest, '[:embeddings]', :now, :now)
ON CONFLICT (model_id, strategy_id, rag_document_fragment_id)
DO UPDATE SET
model_version = :model_version,
strategy_version = :strategy_version,
digest = :digest,
embeddings = '[:embeddings]',
updated_at = CURRENT_TIMESTAMP
updated_at = :now
SQL
fragment_id: target.id,
model_id: id,
@ -483,6 +485,7 @@ module DiscourseAi
strategy_version: @strategy.version,
digest: digest,
embeddings: vector,
now: Time.zone.now,
)
else
raise ArgumentError, "Invalid target type"

View File

@ -24,13 +24,14 @@ RSpec.describe Jobs::EmbeddingsBackfill do
DiscourseAi::Embeddings::VectorRepresentations::Base.current_representation(strategy)
end
it "backfills topics based on bumped_at date" do
before do
SiteSetting.ai_embeddings_enabled = true
SiteSetting.ai_embeddings_discourse_service_api_endpoint = "http://test.com"
SiteSetting.ai_embeddings_backfill_batch_size = 1
Jobs.run_immediately!
end
it "backfills topics based on bumped_at date" do
embedding = Array.new(1024) { 1 }
WebMock.stub_request(
@ -51,5 +52,20 @@ RSpec.describe Jobs::EmbeddingsBackfill do
topic_ids = DB.query_single("SELECT topic_id from #{vector_rep.topic_table_name}")
expect(topic_ids).to contain_exactly(first_topic.id, second_topic.id, third_topic.id)
freeze_time 1.day.from_now
# new title forces a reindex
third_topic.update!(updated_at: Time.zone.now, title: "new title - 123")
Jobs::EmbeddingsBackfill.new.execute({})
index_date =
DB.query_single(
"SELECT updated_at from #{vector_rep.topic_table_name} WHERE topic_id = ?",
third_topic.id,
).first
expect(index_date).to be_within_one_second_of(Time.zone.now)
end
end