discourse-ai/lib/ai_helper/semantic_categorizer.rb

# frozen_string_literal: true
module DiscourseAi
  module AiHelper
    # Suggests categories and tags for a piece of text.
    #
    # The text is embedded, the nearest topics are fetched through the current
    # vector representation, and the categories/tags attached to those topics
    # are ranked by the summed similarity of the topics they appear on.
    class SemanticCategorizer
      # @param input [Hash] only the +:text+ key is read
      # @param user [#guardian] user whose guardian scopes which categories and
      #   tags may be suggested
      def initialize(input, user)
        @user = user
        @text = input[:text]
      end

      # Up to five categories the user may create topics in, best match first.
      #
      # @return [Array<Hash>] hashes with the keys +:id+, +:name+, +:slug+,
      #   +:color+, +:topicCount+ and +:score+; empty when the text is blank,
      #   embeddings are disabled, or no similar topic was found
      def categories
        return [] if @text.blank?
        return [] unless SiteSetting.ai_embeddings_enabled

        candidates = nearest_neighbors(limit: 100)
        # No neighbours would interpolate an untyped `ARRAY[]` literal into the
        # ORDER BY below, which PostgreSQL rejects.
        return [] if candidates.empty?

        candidate_ids = candidates.map(&:first)
        distances = distances_by_topic_id(candidates)

        rows =
          ::Topic
            .joins(:category)
            .where(id: candidate_ids)
            .where("categories.id IN (?)", Category.topic_create_allowed(@user.guardian).pluck(:id))
            .order("array_position(ARRAY#{candidate_ids}, topics.id)")
            .pluck(
              "topics.id",
              "categories.id",
              "categories.name",
              "categories.slug",
              "categories.color",
              "categories.topic_count",
            )

        # The distance is looked up by topic id, not by row position: rows that
        # were filtered out by the category restriction shift every later row,
        # so positions in `rows` do not line up with positions in `candidates`.
        suggestions =
          rows.map do |topic_id, id, name, slug, color, topic_count|
            {
              id: id,
              name: name,
              slug: slug,
              color: color,
              topicCount: topic_count,
              score: similarity_score(distances.fetch(topic_id)),
            }
          end

        top_suggestions(suggestions)
      end

      # Up to five tags visible to the user, best match first.
      #
      # @return [Array<Hash>] hashes with the keys +:id+, +:name+, +:count+ and
      #   +:score+; empty when the text is blank, embeddings are disabled, or
      #   no similar topic was found
      def tags
        return [] if @text.blank?
        return [] unless SiteSetting.ai_embeddings_enabled

        candidates = nearest_neighbors(limit: 100)
        # See #categories: an empty candidate list must never reach the SQL.
        return [] if candidates.empty?

        candidate_ids = candidates.map(&:first)
        distances = distances_by_topic_id(candidates)

        # Name of the tags column holding the topic count to expose for this user.
        count_column = Tag.topic_count_column(@user.guardian)

        rows =
          ::Topic
            .joins(:topic_tags, :tags)
            .where(id: candidate_ids)
            .where("tags.id IN (?)", DiscourseTagging.visible_tags(@user.guardian).pluck(:id))
            .group("topics.id, tags.id, tags.name") # one row per (topic, tag) pair
            .order("array_position(ARRAY#{candidate_ids}, topics.id)")
            .pluck("topics.id", "tags.id", "tags.name", "tags.#{count_column}")
            .uniq

        # A topic contributes one row per tag, so there can be more rows than
        # candidates; the distance therefore has to be resolved by topic id.
        suggestions =
          rows.map do |topic_id, id, name, count|
            {
              id: id,
              name: name,
              count: count,
              score: similarity_score(distances.fetch(topic_id)),
            }
          end

        top_suggestions(suggestions)
      end

      private

      # Runs a similarity search for @text against topic embeddings.
      #
      # @param limit [Integer] maximum number of neighbours requested
      # @return [Array] search results; callers treat each entry as a pair
      #   whose first element is a topic id and whose last is a distance
      def nearest_neighbors(limit: 100)
        strategy = DiscourseAi::Embeddings::Strategies::Truncation.new
        vector_rep =
          DiscourseAi::Embeddings::VectorRepresentations::Base.current_representation(strategy)

        raw_vector = vector_rep.vector_from(@text)

        vector_rep.asymmetric_topics_similarity_search(
          raw_vector,
          limit: limit,
          offset: 0,
          return_distance: true,
        )
      end

      # Indexes the search results as { topic_id => distance }.
      def distances_by_topic_id(candidates)
        candidates.to_h { |candidate| [candidate.first, candidate.last] }
      end

      # Converts a distance into a score that grows as the distance shrinks.
      # The float literal keeps an Integer distance from truncating to 0.
      def similarity_score(distance)
        1.0 / (distance + 1)
      end

      # Merges suggestions that share a name (summing their scores) and keeps
      # the five highest-scoring ones, best first.
      def top_suggestions(suggestions)
        suggestions
          .group_by { |suggestion| suggestion[:name] }
          .map { |_name, group| group.first.merge(score: group.sum { |s| s[:score] }) }
          .sort_by { |suggestion| -suggestion[:score] }
          .take(5)
      end
    end
  end
end
FEATURE: Additional AI suggestion options (#176) 2023-09-01 20:10:58 -04:00			`# frozen_string_literal: true`
			`module DiscourseAi`
			`module AiHelper`
			`class SemanticCategorizer`
FIX: Suggest category/tag was broken since 2e5a393 (#237) 2023-10-02 15:36:56 -04:00			`def initialize(input, user)`
FEATURE: Return only applicable suggestions in AiHelper category/tags suggestions (#184) 2023-09-04 13:30:33 -04:00			`@user = user`
FIX: Suggest category/tag was broken since 2e5a393 (#237) 2023-10-02 15:36:56 -04:00			`@text = input[:text]`
FEATURE: Additional AI suggestion options (#176) 2023-09-01 20:10:58 -04:00			`end`

			`def categories`
			`return [] if @text.blank?`
			`return [] unless SiteSetting.ai_embeddings_enabled`

FIX: Tag/category suggestion broke in 2c0f535 (#198) 2023-09-05 13:15:01 -04:00			`candidates = nearest_neighbors(limit: 100)`
FEATURE: Additional AI suggestion options (#176) 2023-09-01 20:10:58 -04:00			`candidate_ids = candidates.map(&:first)`

			`::Topic`
			`.joins(:category)`
			`.where(id: candidate_ids)`
FEATURE: Return only applicable suggestions in AiHelper category/tags suggestions (#184) 2023-09-04 13:30:33 -04:00			`.where("categories.id IN (?)", Category.topic_create_allowed(@user.guardian).pluck(:id))`
FEATURE: Additional AI suggestion options (#176) 2023-09-01 20:10:58 -04:00			`.order("array_position(ARRAY#{candidate_ids}, topics.id)")`
REFACTOR: Helper suggestions (#914) This PR adds some updates to the Helper suggestions to improve it's functionality and modernize some of the codebase. 2024-11-27 15:21:03 -05:00			`.pluck(`
			`"categories.id",`
			`"categories.name",`
			`"categories.slug",`
			`"categories.color",`
			`"categories.topic_count",`
			`)`
FEATURE: Additional AI suggestion options (#176) 2023-09-01 20:10:58 -04:00			`.map`
REFACTOR: Helper suggestions (#914) This PR adds some updates to the Helper suggestions to improve it's functionality and modernize some of the codebase. 2024-11-27 15:21:03 -05:00			`.with_index do \|(id, name, slug, color, topic_count), index\|`
			`{`
			`id: id,`
			`name: name,`
			`slug: slug,`
			`color: color,`
			`topicCount: topic_count,`
			`score: candidates[index].last,`
			`}`
			`end`
FEATURE: Additional AI suggestion options (#176) 2023-09-01 20:10:58 -04:00			`.map do \|c\|`
			`c[:score] = 1 / (c[:score] + 1) # inverse of the distance`
			`c`
			`end`
			`.group_by { \|c\| c[:name] }`
REFACTOR: Helper suggestions (#914) This PR adds some updates to the Helper suggestions to improve it's functionality and modernize some of the codebase. 2024-11-27 15:21:03 -05:00			`.map { \|name, scores\| scores.first.merge(score: scores.sum { \|s\| s[:score] }) }`
FEATURE: Additional AI suggestion options (#176) 2023-09-01 20:10:58 -04:00			`.sort_by { \|c\| -c[:score] }`
			`.take(5)`
			`end`

			`def tags`
			`return [] if @text.blank?`
			`return [] unless SiteSetting.ai_embeddings_enabled`

FIX: Tag/category suggestion broke in 2c0f535 (#198) 2023-09-05 13:15:01 -04:00			`candidates = nearest_neighbors(limit: 100)`
FEATURE: Additional AI suggestion options (#176) 2023-09-01 20:10:58 -04:00			`candidate_ids = candidates.map(&:first)`

REFACTOR: Helper suggestions (#914) This PR adds some updates to the Helper suggestions to improve it's functionality and modernize some of the codebase. 2024-11-27 15:21:03 -05:00			`count_column = Tag.topic_count_column(@user.guardian) # Determine the count column`

FEATURE: Additional AI suggestion options (#176) 2023-09-01 20:10:58 -04:00			`::Topic`
			`.joins(:topic_tags, :tags)`
			`.where(id: candidate_ids)`
FEATURE: Return only applicable suggestions in AiHelper category/tags suggestions (#184) 2023-09-04 13:30:33 -04:00			`.where("tags.id IN (?)", DiscourseTagging.visible_tags(@user.guardian).pluck(:id))`
REFACTOR: Helper suggestions (#914) This PR adds some updates to the Helper suggestions to improve it's functionality and modernize some of the codebase. 2024-11-27 15:21:03 -05:00			`.group("topics.id, tags.id, tags.name") # Group by topics.id and tags.id`
FEATURE: Additional AI suggestion options (#176) 2023-09-01 20:10:58 -04:00			`.order("array_position(ARRAY#{candidate_ids}, topics.id)")`
REFACTOR: Helper suggestions (#914) This PR adds some updates to the Helper suggestions to improve it's functionality and modernize some of the codebase. 2024-11-27 15:21:03 -05:00			`.pluck(`
			`"tags.id",`
			`"tags.name",`
			`"tags.#{count_column}",`
			`"MIN(array_position(ARRAY#{candidate_ids}, topics.id))", # Get minimum index for ordering`
			`)`
			`.uniq # Ensure unique tags per topic`
FEATURE: Additional AI suggestion options (#176) 2023-09-01 20:10:58 -04:00			`.map`
REFACTOR: Helper suggestions (#914) This PR adds some updates to the Helper suggestions to improve it's functionality and modernize some of the codebase. 2024-11-27 15:21:03 -05:00			`.with_index do \|(id, name, count, index), idx\|`
			`{`
			`id: id,`
			`name: name,`
			`count: count,`
			`score: 1 / (candidates[idx].last + 1), # Inverse of the distance for score`
			`}`
FEATURE: Additional AI suggestion options (#176) 2023-09-01 20:10:58 -04:00			`end`
REFACTOR: Helper suggestions (#914) This PR adds some updates to the Helper suggestions to improve it's functionality and modernize some of the codebase. 2024-11-27 15:21:03 -05:00			`.group_by { \|tag\| tag[:name] }`
			`.map do \|name, tags\|`
			`tags.first.merge(score: tags.sum { \|t\| t[:score] })`
			`end # Aggregate scores per tag`
			`.sort_by { \|tag\| -tag[:score] }`
FEATURE: Additional AI suggestion options (#176) 2023-09-01 20:10:58 -04:00			`.take(5)`
			`end`
FIX: Tag/category suggestion broke in 2c0f535 (#198) 2023-09-05 13:15:01 -04:00
			`private`

			`def nearest_neighbors(limit: 100)`
			`strategy = DiscourseAi::Embeddings::Strategies::Truncation.new`
			`vector_rep =`
			`DiscourseAi::Embeddings::VectorRepresentations::Base.current_representation(strategy)`

			`raw_vector = vector_rep.vector_from(@text)`

			`vector_rep.asymmetric_topics_similarity_search(`
			`raw_vector,`
			`limit: limit,`
			`offset: 0,`
			`return_distance: true,`
			`)`
			`end`
FEATURE: Additional AI suggestion options (#176) 2023-09-01 20:10:58 -04:00			`end`
			`end`
			`end`