mirror of
https://github.com/discourse/discourse-ai.git
synced 2025-10-29 13:38:37 +00:00
* FEATURE: HyDE-powered semantic search. It relies on the new outlet added in discourse/discourse#23390 to display semantic search results in an unobtrusive way. We'll use a HyDE-backed approach for semantic search, which consists of generating a hypothetical document from the given keywords, which is then transformed into a vector and used in an asymmetric similarity topic search. This PR also reorganizes the internals to have fewer moving parts, maintaining one hierarchy of DAOish classes for vector-related operations like transformations and querying. Completions and vectors created by HyDE will remain cached on Redis for now, but we could later use Postgres instead. * Missing translation and rate limiting --------- Co-authored-by: Roman Rizzi <rizziromanalejandro@gmail.com>
76 lines
2.5 KiB
Ruby
76 lines
2.5 KiB
Ruby
# frozen_string_literal: true
|
|
module DiscourseAi
  module AiHelper
    # Suggests categories and tags for a piece of text by running a semantic
    # search for similar topics and aggregating the categories / tags those
    # topics use, weighted by how close each topic is to the text.
    class SemanticCategorizer
      # How many similar topics are requested from the semantic search.
      CANDIDATE_LIMIT = 100
      # Maximum number of suggestions returned by #categories and #tags.
      MAX_SUGGESTIONS = 5
      private_constant :CANDIDATE_LIMIT, :MAX_SUGGESTIONS

      # @param text [String] text to find suggestions for
      # @param user [#guardian] user whose guardian scopes the categories and
      #   tags that may be suggested
      def initialize(text, user)
        @user = user
        @text = text
      end

      # Suggests up to MAX_SUGGESTIONS category slugs for the text.
      #
      # @return [Array<Hash>] `{ name: <category slug>, score: <Float> }`
      #   entries, highest score first. Empty when the text is blank,
      #   embeddings are disabled, or the search finds nothing.
      def categories
        return [] if @text.blank?
        return [] unless SiteSetting.ai_embeddings_enabled

        distances = candidate_distances
        # An empty id list would render as `ARRAY[]` in the ORDER BY below,
        # which PostgreSQL cannot type; there is nothing to suggest anyway.
        return [] if distances.empty?

        candidate_ids = distances.keys

        scored =
          ::Topic
            .joins(:category)
            .where(id: candidate_ids)
            .where("categories.id IN (?)", Category.topic_create_allowed(@user.guardian).pluck(:id))
            .order("array_position(ARRAY#{candidate_ids}, topics.id)")
            .pluck("topics.id", "categories.slug")
            .map do |topic_id, slug|
              # Look the distance up by topic id: the filters above can drop
              # candidates, so row position does not match candidate position.
              { name: slug, score: similarity(distances.fetch(topic_id)) }
            end

        top_suggestions(scored)
      end

      # Suggests up to MAX_SUGGESTIONS tag names for the text.
      #
      # @return [Array<Hash>] `{ name: <tag name>, score: <Float> }` entries,
      #   highest score first. Empty when the text is blank, embeddings are
      #   disabled, or the search finds nothing.
      def tags
        return [] if @text.blank?
        return [] unless SiteSetting.ai_embeddings_enabled

        distances = candidate_distances
        # See #categories: avoids an untyped `ARRAY[]` in the ORDER BY.
        return [] if distances.empty?

        candidate_ids = distances.keys

        scored =
          ::Topic
            .joins(:topic_tags, :tags)
            .where(id: candidate_ids)
            .where("tags.id IN (?)", DiscourseTagging.visible_tags(@user.guardian).pluck(:id))
            .group("topics.id")
            .order("array_position(ARRAY#{candidate_ids}, topics.id)")
            .pluck("topics.id", "array_agg(tags.name)")
            .flat_map do |topic_id, tag_names|
              # Untagged topics and topics whose tags are all filtered out
              # never reach this point, so the distance must be resolved by
              # topic id rather than by row position.
              score = similarity(distances.fetch(topic_id))
              # The aggregated list can repeat a name; count each tag once
              # per topic.
              tag_names.uniq.map { |tag_name| { name: tag_name, score: score } }
            end

        top_suggestions(scored)
      end

      private

      # Runs the asymmetric semantic search for the text.
      #
      # Each result row is read as a pair whose first element is a topic id
      # and whose last element is that topic's distance to the text.
      #
      # @return [Hash] topic id => distance, in the order the search returned
      #   the rows
      def candidate_distances
        strategy = DiscourseAi::Embeddings::Strategies::Truncation.new
        vector_rep =
          DiscourseAi::Embeddings::VectorRepresentations::Base.current_representation(strategy)

        vector_rep
          .asymmetric_semantic_search(
            @text,
            limit: CANDIDATE_LIMIT,
            offset: 0,
            return_distance: true,
          )
          .to_h { |row| [row.first, row.last] }
      end

      # Turns a distance into a score that grows as the distance shrinks.
      # The float literal prevents integer division for Integer distances.
      def similarity(distance)
        1.0 / (distance + 1)
      end

      # Sums scores per name and keeps the MAX_SUGGESTIONS best names.
      #
      # @param scored [Array<Hash>] `{ name:, score: }` entries, one per
      #   (topic, name) occurrence
      # @return [Array<Hash>] `{ name:, score: }` entries, highest score first
      def top_suggestions(scored)
        scored
          .group_by { |entry| entry[:name] }
          .map { |name, entries| { name: name, score: entries.sum { |entry| entry[:score] } } }
          .sort_by { |entry| -entry[:score] }
          .take(MAX_SUGGESTIONS)
      end
    end
  end
end
|