mirror of
				https://github.com/discourse/discourse-ai.git
				synced 2025-11-04 08:28:46 +00:00 
			
		
		
		
	* FEATURE: HyDE-powered semantic search. It relies on the new outlet added in discourse/discourse#23390 to display semantic search results in an unobtrusive way. We'll use a HyDE-backed approach for semantic search, which consists of generating a hypothetical document from the given keywords, which gets transformed into a vector and used in an asymmetric similarity topic search. This PR also reorganizes the internals to have fewer moving parts, maintaining one hierarchy of DAOish classes for vector-related operations like transformations and querying. Completions and vectors created by HyDE will remain cached on Redis for now, but we could later use Postgres instead. * Missing translation and rate limiting --------- Co-authored-by: Roman Rizzi <rizziromanalejandro@gmail.com>
		
			
				
	
	
		
			63 lines
		
	
	
		
			1.5 KiB
		
	
	
	
		
			Ruby
		
	
	
	
	
	
			
		
		
	
	
			63 lines
		
	
	
		
			1.5 KiB
		
	
	
	
		
			Ruby
		
	
	
	
	
	
# frozen_string_literal: true
 | 
						|
 | 
						|
module DiscourseAi
  module Embeddings
    module Strategies
      # Builds the text that is fed to an embedding model for a Topic or a
      # Post, cutting it down to at most `max_length` tokens.
      class Truncation
        # @return [Integer] constant identifier of this strategy
        #   (presumably persisted next to generated embeddings — confirm with callers)
        def id
          1
        end

        # @return [Integer] constant version of this strategy
        #   (presumably bumped when the produced text format changes — confirm with callers)
        def version
          1
        end

        # Produces the truncated text representation of `target`.
        #
        # @param target [Topic, Post] record to turn into text
        # @param tokenizer [#size, #truncate] object able to count the tokens of
        #   a string and to truncate a string to a token budget
        # @param max_length [Integer] token budget handed to the tokenizer
        # @return [String] whatever `tokenizer.truncate` returns for the built text
        # @raise [ArgumentError] when `target` is neither a Topic nor a Post
        def prepare_text_from(target, tokenizer, max_length)
          case target
          when Topic
            topic_truncation(target, tokenizer, max_length)
          when Post
            post_truncation(target, tokenizer, max_length)
          else
            raise ArgumentError, "Invalid target type"
          end
        end

        private

        # Builds the header shared by topic and post texts: title, category
        # name and (when tagging is enabled) the comma-separated tag names,
        # each followed by a blank line.
        #
        # A nil topic yields an empty header and a topic without a category
        # simply omits the category section, instead of raising NoMethodError.
        #
        # @param topic [Topic, nil]
        # @return [String] a fresh, mutable string
        def topic_information(topic)
          info = +""
          return info if topic.nil?

          sections = [topic.title]

          category_name = topic.category&.name
          sections << category_name if category_name

          # Kept even when the topic has no tags so the output stays identical
          # to what was produced before for such topics.
          sections << topic.tags.pluck(:name).join(", ") if SiteSetting.tagging_enabled

          sections.each { |section| info << section << "\n\n" }
          info
        end

        # Header plus the raw content of the topic's posts, appended until the
        # token budget is reached, then truncated to `max_length` tokens.
        #
        # @param topic [Topic]
        # @param tokenizer [#size, #truncate]
        # @param max_length [Integer]
        # @return [String]
        def topic_truncation(topic, tokenizer, max_length)
          text = topic_information(topic)

          # NOTE(review): assuming `topic.posts` is an ActiveRecord relation,
          # `find_each` loads it in batches ordered by primary key.
          topic.posts.find_each do |post|
            text << post.raw
            # OPTIMIZE: this re-tokenizes the whole accumulated text on every
            # iteration; a running counter would be faster but may not match
            # the token count of the concatenated text exactly.
            break if tokenizer.size(text) >= max_length
            text << "\n\n"
          end

          tokenizer.truncate(text, max_length)
        end

        # Header of the post's topic followed by the post's raw content,
        # truncated to `max_length` tokens.
        #
        # @param post [Post]
        # @param tokenizer [#size, #truncate]
        # @param max_length [Integer]
        # @return [String]
        def post_truncation(post, tokenizer, max_length)
          text = topic_information(post.topic)
          text << post.raw

          tokenizer.truncate(text, max_length)
        end
      end
    end
  end
end
 |