FIX: we were never reindexing old content (#786)
* FIX: we were never reindexing old content Embedding backfill contains logic for searching for old content change and then backfilling. Unfortunately it was excluding all topics that had embedding unconditionally, leading to no backfill ever happening. This change adds a test and ensures we backfill. * over select results, this ensures we will be more likely to find ai results when filtered
This commit is contained in:
parent
eee8e72756
commit
584753cf60
|
@ -54,9 +54,12 @@ module Jobs
|
|||
# Finally, we'll try to backfill embeddings for topics that have outdated
|
||||
# embeddings due to edits or new replies. Here we only do 10% of the limit
|
||||
relation =
|
||||
topics.where("#{table_name}.updated_at < ?", 7.days.ago).limit((limit - rebaked) / 10)
|
||||
topics
|
||||
.where("#{table_name}.updated_at < ?", 6.hours.ago)
|
||||
.where("#{table_name}.updated_at < topics.updated_at")
|
||||
.limit((limit - rebaked) / 10)
|
||||
|
||||
populate_topic_embeddings(vector_rep, relation)
|
||||
populate_topic_embeddings(vector_rep, relation, force: true)
|
||||
|
||||
return if rebaked >= limit
|
||||
|
||||
|
@ -115,9 +118,12 @@ module Jobs
|
|||
|
||||
private
|
||||
|
||||
def populate_topic_embeddings(vector_rep, topics)
|
||||
def populate_topic_embeddings(vector_rep, topics, force: false)
|
||||
done = 0
|
||||
ids = topics.where("#{vector_rep.topic_table_name}.topic_id IS NULL").pluck("topics.id")
|
||||
|
||||
topics = topics.where("#{vector_rep.topic_table_name}.topic_id IS NULL") if !force
|
||||
|
||||
ids = topics.pluck("topics.id")
|
||||
|
||||
ids.each do |id|
|
||||
topic = Topic.find_by(id: id)
|
||||
|
|
|
@ -67,6 +67,11 @@ module DiscourseAi
|
|||
.fetch(embedding_key, expires_in: 1.week) { vector_rep.vector_from(search_term) }
|
||||
end
|
||||
|
||||
# this ensures the candidate topics are over selected
|
||||
# that way we have a much better chance of finding topics
|
||||
# if the user filtered the results or index is a bit out of date
|
||||
OVER_SELECTION_FACTOR = 4
|
||||
|
||||
def search_for_topics(query, page = 1, hyde: true)
|
||||
max_results_per_page = 100
|
||||
limit = [Search.per_filter, max_results_per_page].min + 1
|
||||
|
@ -78,10 +83,12 @@ module DiscourseAi
|
|||
|
||||
search_embedding = hyde ? hyde_embedding(search_term) : embedding(search_term)
|
||||
|
||||
over_selection_limit = limit * OVER_SELECTION_FACTOR
|
||||
|
||||
candidate_topic_ids =
|
||||
vector_rep.asymmetric_topics_similarity_search(
|
||||
search_embedding,
|
||||
limit: limit,
|
||||
limit: over_selection_limit,
|
||||
offset: offset,
|
||||
)
|
||||
|
||||
|
@ -92,6 +99,7 @@ module DiscourseAi
|
|||
.where("topics.visible")
|
||||
.where(topic_id: candidate_topic_ids, post_number: 1)
|
||||
.order("array_position(ARRAY#{candidate_topic_ids}, posts.topic_id)")
|
||||
.limit(limit)
|
||||
|
||||
query_filter_results = search.apply_filters(semantic_results)
|
||||
|
||||
|
|
|
@ -425,14 +425,14 @@ module DiscourseAi
|
|||
DB.exec(
|
||||
<<~SQL,
|
||||
INSERT INTO #{topic_table_name} (topic_id, model_id, model_version, strategy_id, strategy_version, digest, embeddings, created_at, updated_at)
|
||||
VALUES (:topic_id, :model_id, :model_version, :strategy_id, :strategy_version, :digest, '[:embeddings]', CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
|
||||
VALUES (:topic_id, :model_id, :model_version, :strategy_id, :strategy_version, :digest, '[:embeddings]', :now, :now)
|
||||
ON CONFLICT (strategy_id, model_id, topic_id)
|
||||
DO UPDATE SET
|
||||
model_version = :model_version,
|
||||
strategy_version = :strategy_version,
|
||||
digest = :digest,
|
||||
embeddings = '[:embeddings]',
|
||||
updated_at = CURRENT_TIMESTAMP
|
||||
updated_at = :now
|
||||
SQL
|
||||
topic_id: target.id,
|
||||
model_id: id,
|
||||
|
@ -441,19 +441,20 @@ module DiscourseAi
|
|||
strategy_version: @strategy.version,
|
||||
digest: digest,
|
||||
embeddings: vector,
|
||||
now: Time.zone.now,
|
||||
)
|
||||
elsif target.is_a?(Post)
|
||||
DB.exec(
|
||||
<<~SQL,
|
||||
INSERT INTO #{post_table_name} (post_id, model_id, model_version, strategy_id, strategy_version, digest, embeddings, created_at, updated_at)
|
||||
VALUES (:post_id, :model_id, :model_version, :strategy_id, :strategy_version, :digest, '[:embeddings]', CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
|
||||
VALUES (:post_id, :model_id, :model_version, :strategy_id, :strategy_version, :digest, '[:embeddings]', :now, :now)
|
||||
ON CONFLICT (model_id, strategy_id, post_id)
|
||||
DO UPDATE SET
|
||||
model_version = :model_version,
|
||||
strategy_version = :strategy_version,
|
||||
digest = :digest,
|
||||
embeddings = '[:embeddings]',
|
||||
updated_at = CURRENT_TIMESTAMP
|
||||
updated_at = :now
|
||||
SQL
|
||||
post_id: target.id,
|
||||
model_id: id,
|
||||
|
@ -462,19 +463,20 @@ module DiscourseAi
|
|||
strategy_version: @strategy.version,
|
||||
digest: digest,
|
||||
embeddings: vector,
|
||||
now: Time.zone.now,
|
||||
)
|
||||
elsif target.is_a?(RagDocumentFragment)
|
||||
DB.exec(
|
||||
<<~SQL,
|
||||
INSERT INTO #{rag_fragments_table_name} (rag_document_fragment_id, model_id, model_version, strategy_id, strategy_version, digest, embeddings, created_at, updated_at)
|
||||
VALUES (:fragment_id, :model_id, :model_version, :strategy_id, :strategy_version, :digest, '[:embeddings]', CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)
|
||||
VALUES (:fragment_id, :model_id, :model_version, :strategy_id, :strategy_version, :digest, '[:embeddings]', :now, :now)
|
||||
ON CONFLICT (model_id, strategy_id, rag_document_fragment_id)
|
||||
DO UPDATE SET
|
||||
model_version = :model_version,
|
||||
strategy_version = :strategy_version,
|
||||
digest = :digest,
|
||||
embeddings = '[:embeddings]',
|
||||
updated_at = CURRENT_TIMESTAMP
|
||||
updated_at = :now
|
||||
SQL
|
||||
fragment_id: target.id,
|
||||
model_id: id,
|
||||
|
@ -483,6 +485,7 @@ module DiscourseAi
|
|||
strategy_version: @strategy.version,
|
||||
digest: digest,
|
||||
embeddings: vector,
|
||||
now: Time.zone.now,
|
||||
)
|
||||
else
|
||||
raise ArgumentError, "Invalid target type"
|
||||
|
|
|
@ -24,13 +24,14 @@ RSpec.describe Jobs::EmbeddingsBackfill do
|
|||
DiscourseAi::Embeddings::VectorRepresentations::Base.current_representation(strategy)
|
||||
end
|
||||
|
||||
it "backfills topics based on bumped_at date" do
|
||||
before do
|
||||
SiteSetting.ai_embeddings_enabled = true
|
||||
SiteSetting.ai_embeddings_discourse_service_api_endpoint = "http://test.com"
|
||||
SiteSetting.ai_embeddings_backfill_batch_size = 1
|
||||
|
||||
Jobs.run_immediately!
|
||||
end
|
||||
|
||||
it "backfills topics based on bumped_at date" do
|
||||
embedding = Array.new(1024) { 1 }
|
||||
|
||||
WebMock.stub_request(
|
||||
|
@ -51,5 +52,20 @@ RSpec.describe Jobs::EmbeddingsBackfill do
|
|||
topic_ids = DB.query_single("SELECT topic_id from #{vector_rep.topic_table_name}")
|
||||
|
||||
expect(topic_ids).to contain_exactly(first_topic.id, second_topic.id, third_topic.id)
|
||||
|
||||
freeze_time 1.day.from_now
|
||||
|
||||
# new title forces a reindex
|
||||
third_topic.update!(updated_at: Time.zone.now, title: "new title - 123")
|
||||
|
||||
Jobs::EmbeddingsBackfill.new.execute({})
|
||||
|
||||
index_date =
|
||||
DB.query_single(
|
||||
"SELECT updated_at from #{vector_rep.topic_table_name} WHERE topic_id = ?",
|
||||
third_topic.id,
|
||||
).first
|
||||
|
||||
expect(index_date).to be_within_one_second_of(Time.zone.now)
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue