2023-03-31 14:29:56 -04:00
|
|
|
# frozen_string_literal: true
|
|
|
|
|
|
|
|
module DiscourseAi
|
|
|
|
module Embeddings
|
|
|
|
class SemanticSearch
|
2023-09-05 10:08:23 -04:00
|
|
|
# Removes the cache entries that semantic search may have written for +query+.
#
# Three keys are deleted, all derived from the SHA1 digest of +query+:
# * the hypothetical post key (same format as #build_hyde_key),
# * the embedding key that includes the HyDE model (same format as
#   #build_embedding_key),
# * the embedding key written by #embedding, which passes an empty string as
#   the HyDE model and therefore contains a double dash.
#
# @param query [String] the text whose cached entries should be dropped
def self.clear_cache_for(query)
  digest = OpenSSL::Digest::SHA1.hexdigest(query)
  embeddings_model = SiteSetting.ai_embeddings_model

  key_prefix = "semantic-search-#{digest}"
  hyde_key = "#{key_prefix}-#{SiteSetting.ai_embeddings_semantic_search_hyde_model}"

  Discourse.cache.delete(hyde_key)
  Discourse.cache.delete("#{hyde_key}-#{embeddings_model}")

  # The non-HyDE embedding is stored under "<prefix>-<empty hyde model>-<model>",
  # so the key must keep the prefix and the resulting double dash to match.
  Discourse.cache.delete("#{key_prefix}--#{embeddings_model}")
end
|
|
|
|
|
2023-07-13 11:41:36 -04:00
|
|
|
# @param guardian the guardian kept for later searches; other methods in this
#   class read +guardian.user+ and call +guardian.filter_allowed_categories+
def initialize(guardian)
  @guardian = guardian
end
|
|
|
|
|
|
|
|
# Tells whether the cache already holds a value under the embedding key built
# from +query+, the configured HyDE model and the configured embeddings model.
#
# @param query [String] text that is SHA1-hashed to build the cache key
# @return [Boolean] true when the cached value is present
def cached_query?(query)
  query_digest = OpenSSL::Digest::SHA1.hexdigest(query)

  cache_key =
    build_embedding_key(
      query_digest,
      SiteSetting.ai_embeddings_semantic_search_hyde_model,
      SiteSetting.ai_embeddings_model,
    )

  cached_value = Discourse.cache.read(cache_key)
  cached_value.present?
end
|
|
|
|
|
2024-12-16 07:55:39 -05:00
|
|
|
# Memoized accessor for DiscourseAi::Embeddings::Vector.instance; the lookup
# happens on first use and the result is reused for the life of this object.
def vector
  @vector ||= DiscourseAi::Embeddings::Vector.instance
end
|
2023-09-05 10:08:23 -04:00
|
|
|
|
2024-08-28 00:17:34 -04:00
|
|
|
# Returns the embedding of a hypothetical post written about +search_term+.
#
# Two cache entries are involved, each kept for one week:
# * the hypothetical post text, produced by #hypothetical_post_from,
# * the vector computed from that text via +vector.vector_from+.
#
# @param search_term [String] the term to search for
# @return the cached or freshly computed embedding
def hyde_embedding(search_term)
  term_digest = OpenSSL::Digest::SHA1.hexdigest(search_term)

  post_cache_key =
    build_hyde_key(term_digest, SiteSetting.ai_embeddings_semantic_search_hyde_model)

  vector_cache_key =
    build_embedding_key(
      term_digest,
      SiteSetting.ai_embeddings_semantic_search_hyde_model,
      SiteSetting.ai_embeddings_model,
    )

  # The post is fetched (or generated) first because the vector block needs it.
  hypothetical_post =
    Discourse.cache.fetch(post_cache_key, expires_in: 1.week) do
      hypothetical_post_from(search_term)
    end

  Discourse.cache.fetch(vector_cache_key, expires_in: 1.week) do
    vector.vector_from(hypothetical_post)
  end
end
|
|
|
|
|
|
|
|
# Returns the embedding of +search_term+ itself (no hypothetical post),
# cached for one week.
#
# The cache key is built with an empty string in place of the HyDE model.
#
# @param search_term [String] the term to embed
# @return the cached or freshly computed embedding
def embedding(search_term)
  term_digest = OpenSSL::Digest::SHA1.hexdigest(search_term)
  cache_key = build_embedding_key(term_digest, "", SiteSetting.ai_embeddings_model)

  Discourse.cache.fetch(cache_key, expires_in: 1.week) do
    vector.vector_from(search_term)
  end
end
|
|
|
|
|
2024-08-30 00:37:55 -04:00
|
|
|
# Over-select candidate topics from the similarity search so that we have a
# much better chance of still finding enough topics when the user filtered
# the results or the index is a bit out of date.
|
|
|
|
OVER_SELECTION_FACTOR = 4
|
|
|
|
|
2024-08-28 00:17:34 -04:00
|
|
|
# Finds first posts of topics that are semantically close to +query+.
#
# @param query [String] raw search query; it is parsed with +Search+ and only
#   the resulting term is embedded
# @param page [Integer] 1-based page number used to compute the offset
# @param hyde [Boolean] when true the embedding comes from #hyde_embedding,
#   otherwise from #embedding
# @return the posts scope after the query filters and
#   +guardian.filter_allowed_categories+ were applied, or +Post.none+ when the
#   term is blank/too short or the similarity search yields no candidates
def search_for_topics(query, page = 1, hyde: true)
  max_results_per_page = 100
  # NOTE(review): the extra row presumably lets callers detect a further page — confirm.
  limit = [Search.per_filter, max_results_per_page].min + 1
  offset = (page - 1) * limit
  search = Search.new(query, { guardian: guardian })
  search_term = search.term

  if search_term.blank? || search_term.length < SiteSetting.min_search_term_length
    return Post.none
  end

  search_embedding = hyde ? hyde_embedding(search_term) : embedding(search_term)

  # Ask for more candidates than one page needs; see OVER_SELECTION_FACTOR.
  over_selection_limit = limit * OVER_SELECTION_FACTOR

  schema = DiscourseAi::Embeddings::Schema.for(Topic, vector_def: vector.vdef)

  candidate_topic_ids =
    schema.asymmetric_similarity_search(
      search_embedding,
      limit: over_selection_limit,
      offset: offset,
    ).map(&:topic_id)

  # An empty list would be interpolated below as `ARRAY[]`, which PostgreSQL
  # rejects because it cannot infer the element type. Nothing can match anyway.
  return Post.none if candidate_topic_ids.empty?

  semantic_results =
    ::Post
      .where(post_type: ::Topic.visible_post_types(guardian.user))
      .public_posts
      .where("topics.visible")
      .where(topic_id: candidate_topic_ids, post_number: 1)
      # Keep the order returned by the similarity search.
      .order("array_position(ARRAY#{candidate_topic_ids}, posts.topic_id)")
      .limit(limit)

  query_filter_results = search.apply_filters(semantic_results)

  guardian.filter_allowed_categories(query_filter_results)
end
|
|
|
|
|
2024-03-08 11:02:50 -05:00
|
|
|
# Runs a semantic post search for +query+ and reranks the matches.
#
# Steps, in order:
# 1. embed the parsed search term (cached for one week),
# 2. fetch up to 100 candidate post ids from the similarity search,
# 3. load the visible public posts among them and apply the query filters,
# 4. rerank the plain text of those posts against the search term,
# 5. return the five best posts, restricted to allowed categories.
#
# @param query [String] raw search query, parsed with +Search+
# @return [] when the term is missing/too short or any stage yields nothing;
#   otherwise the result of +guardian.filter_allowed_categories+ on the
#   reranked posts scope
def quick_search(query)
  max_semantic_results_per_page = 100
  max_reranked_results = 5

  search = Search.new(query, { guardian: guardian })
  search_term = search.term

  return [] if search_term.nil? || search_term.length < SiteSetting.min_search_term_length

  digest = OpenSSL::Digest::SHA1.hexdigest(search_term)

  # NOTE(review): this is the same key #hyde_embedding uses, yet the value
  # stored here is the embedding of the raw term (asymetric: true), not of a
  # hypothetical post. Confirm that sharing the entry is intended.
  embedding_key =
    build_embedding_key(
      digest,
      SiteSetting.ai_embeddings_semantic_search_hyde_model,
      SiteSetting.ai_embeddings_model,
    )

  search_term_embedding =
    Discourse.cache.fetch(embedding_key, expires_in: 1.week) do
      # `asymetric` (sic) is the keyword name this code has always passed.
      vector.vector_from(search_term, asymetric: true)
    end

  candidate_post_ids =
    DiscourseAi::Embeddings::Schema
      .for(Post, vector_def: vector.vdef)
      .asymmetric_similarity_search(
        search_term_embedding,
        limit: max_semantic_results_per_page,
        offset: 0,
      )
      .map(&:post_id)

  # An empty list would be interpolated below as `ARRAY[]`, which PostgreSQL
  # rejects because it cannot infer the element type.
  return [] if candidate_post_ids.empty?

  semantic_results =
    ::Post
      .where(post_type: ::Topic.visible_post_types(guardian.user))
      .public_posts
      .where("topics.visible")
      .where(id: candidate_post_ids)
      .order("array_position(ARRAY#{candidate_post_ids}, posts.id)")

  # Materialize once: the records are mapped here and indexed again below.
  filtered_results = search.apply_filters(semantic_results).to_a
  return [] if filtered_results.empty?

  rerank_posts_payload =
    filtered_results.map do |post|
      Nokogiri::HTML5.fragment(post.cooked).text.truncate(2000, omission: "")
    end

  reranked_results =
    DiscourseAi::Inference::HuggingFaceTextEmbeddings.rerank(search_term, rerank_posts_payload)

  # Each reranker entry carries the :index of the payload item it scored.
  reordered_ids =
    reranked_results
      .take(max_reranked_results)
      .map { |result| filtered_results[result[:index]].id }

  # Same `ARRAY[]` hazard as above if the reranker returned nothing.
  return [] if reordered_ids.empty?

  reranked_semantic_results =
    ::Post
      .where(post_type: ::Topic.visible_post_types(guardian.user))
      .public_posts
      .where("topics.visible")
      .where(id: reordered_ids)
      .order("array_position(ARRAY#{reordered_ids}, posts.id)")

  guardian.filter_allowed_categories(reranked_semantic_results)
end
|
|
|
|
|
2023-11-23 10:58:54 -05:00
|
|
|
# Asks the configured HyDE model to write a forum post about +search_term+.
#
# The system message carries the site title and description; the user message
# wraps the term in <input> tags. The model is asked to answer inside <ai>
# tags.
#
# @param search_term [String] subject of the hypothetical post
# @return the text inside the <ai> element when it is present and non-blank,
#   otherwise the raw model response
def hypothetical_post_from(search_term)
  system_message = <<~TEXT.strip
    You are a content creator for a forum. The forum description is as follows:
    #{SiteSetting.title}
    #{SiteSetting.site_description}

    Put the forum post between <ai></ai> tags.
  TEXT

  user_message = <<~TEXT.strip
    Using this description, write a forum post about the subject inside the <input></input> XML tags:

    <input>#{search_term}</input>
  TEXT

  prompt = DiscourseAi::Completions::Prompt.new(system_message)
  prompt.push(type: :user, content: user_message)

  llm = DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_embeddings_semantic_search_hyde_model)
  llm_response = llm.generate(prompt, user: @guardian.user, feature_name: "semantic_search_hyde")

  # Fall back to the whole response when no usable <ai> element is found.
  tagged_text = Nokogiri::HTML5.fragment(llm_response).at("ai")&.text
  tagged_text.presence || llm_response
end
|
2024-05-07 14:17:26 -04:00
|
|
|
|
|
|
|
private
|
|
|
|
|
|
|
|
attr_reader :guardian
|
|
|
|
|
|
|
|
# Builds the cache key under which the hypothetical post is stored.
#
# @param digest [String] digest of the search term
# @param hyde_model [String] HyDE model identifier
# @return [String] "semantic-search-<digest>-<hyde_model>"
def build_hyde_key(digest, hyde_model)
  "semantic-search-#{digest}-#{hyde_model}"
end
|
|
|
|
|
|
|
|
# Builds the cache key for an embedding: the HyDE key followed by a dash and
# the embedding model.
#
# @param digest [String] digest of the search term
# @param hyde_model [String] HyDE model identifier (may be an empty string)
# @param embedding_model [String] embeddings model identifier
# @return [String] "<hyde key>-<embedding_model>"
def build_embedding_key(digest, hyde_model, embedding_model)
  hyde_key = build_hyde_key(digest, hyde_model)
  "#{hyde_key}-#{embedding_model}"
end
|
2023-03-31 14:29:56 -04:00
|
|
|
end
|
|
|
|
end
|
|
|
|
end
|