2023-09-01 20:10:58 -04:00
|
|
|
# frozen_string_literal: true
|
|
|
|
module DiscourseAi
  module AiHelper
    # Suggests categories and tags for a piece of text by looking at the
    # categories/tags attached to the topics returned by an embeddings
    # similarity search for that text.
    class SemanticCategorizer
      # @param input [Hash] only `input[:text]` is read; it is the text to categorize
      # @param user [Object] must respond to `guardian`; used for the
      #   category/tag permission filters
      def initialize(input, user)
        @user = user
        @text = input[:text]
      end

      # Suggests up to five categories for the text.
      #
      # Each neighbouring topic contributes `1 / (distance + 1)` to the score
      # of its category; contributions are summed per category name.
      #
      # @return [Array<Hash>] hashes with keys :id, :name, :slug, :color,
      #   :topicCount and :score, highest score first; empty when the text is
      #   blank, embeddings are disabled, or no neighbours were found
      def categories
        return [] if @text.blank?
        return [] unless SiteSetting.ai_embeddings_enabled

        candidates = nearest_neighbors(limit: 100)
        # Without candidates the ORDER BY below would interpolate `ARRAY[]`,
        # which is not a valid (typed) array literal in SQL.
        return [] if candidates.empty?

        candidate_ids = candidates.map(&:first)
        # The query below can drop candidates (no category, category not
        # allowed for this user), so result rows do not line up with
        # `candidates` by position. Look distances up by topic id instead.
        distance_by_topic_id = candidates.to_h { |candidate| [candidate.first, candidate.last] }

        ::Topic
          .joins(:category)
          .where(id: candidate_ids)
          .where("categories.id IN (?)", Category.topic_create_allowed(@user.guardian).pluck(:id))
          .order("array_position(ARRAY#{candidate_ids}, topics.id)")
          .pluck(
            "topics.id",
            "categories.id",
            "categories.name",
            "categories.slug",
            "categories.color",
            "categories.topic_count",
          )
          .map do |topic_id, id, name, slug, color, topic_count|
            {
              id: id,
              name: name,
              slug: slug,
              color: color,
              topicCount: topic_count,
              score: similarity_score(distance_by_topic_id.fetch(topic_id)),
            }
          end
          .group_by { |category| category[:name] }
          .map { |_name, matches| matches.first.merge(score: matches.sum { |match| match[:score] }) }
          .sort_by { |category| -category[:score] }
          .take(5)
      end

      # Suggests up to five tags for the text.
      #
      # Each (topic, tag) pair among the neighbouring topics contributes
      # `1 / (distance + 1)` to the tag's score; contributions are summed per
      # tag name.
      #
      # @return [Array<Hash>] hashes with keys :id, :name, :count and :score,
      #   highest score first; empty when the text is blank, embeddings are
      #   disabled, or no neighbours were found
      def tags
        return [] if @text.blank?
        return [] unless SiteSetting.ai_embeddings_enabled

        candidates = nearest_neighbors(limit: 100)
        # Same reason as in #categories: `ARRAY[]` cannot be used in the SQL below.
        return [] if candidates.empty?

        candidate_ids = candidates.map(&:first)
        count_column = Tag.topic_count_column(@user.guardian)

        ::Topic
          .joins(:topic_tags, :tags)
          .where(id: candidate_ids)
          .where("tags.id IN (?)", DiscourseTagging.visible_tags(@user.guardian).pluck(:id))
          .group("topics.id, tags.id, tags.name")
          .order("array_position(ARRAY#{candidate_ids}, topics.id)")
          .pluck(
            "tags.id",
            "tags.name",
            "tags.#{count_column}",
            # 1-based position of the topic inside candidate_ids
            "MIN(array_position(ARRAY#{candidate_ids}, topics.id))",
          )
          .uniq
          .map do |id, name, count, position|
            # Rows are per (topic, tag) and filtered, so the row number says
            # nothing about which candidate a row belongs to; the plucked
            # SQL position does. array_position is 1-based, hence the -1.
            {
              id: id,
              name: name,
              count: count,
              score: similarity_score(candidates[position - 1].last),
            }
          end
          .group_by { |tag| tag[:name] }
          .map { |_name, matches| matches.first.merge(score: matches.sum { |match| match[:score] }) }
          .sort_by { |tag| -tag[:score] }
          .take(5)
      end

      private

      # Runs the similarity search for @text against topic embeddings.
      #
      # @param limit [Integer] maximum number of neighbours to request
      # @return [Array] whatever `asymmetric_topics_similarity_search` returns
      #   with `return_distance: true`; callers in this class treat each
      #   element as a `[topic_id, distance]` pair
      def nearest_neighbors(limit: 100)
        strategy = DiscourseAi::Embeddings::Strategies::Truncation.new
        vector_rep =
          DiscourseAi::Embeddings::VectorRepresentations::Base.current_representation(strategy)

        raw_vector = vector_rep.vector_from(@text)

        vector_rep.asymmetric_topics_similarity_search(
          raw_vector,
          limit: limit,
          offset: 0,
          return_distance: true,
        )
      end

      # Turns a distance into a score that grows as the distance shrinks.
      def similarity_score(distance)
        1 / (distance + 1)
      end
    end
  end
end
|