mirror of
				https://github.com/discourse/discourse-ai.git
				synced 2025-11-04 08:28:46 +00:00 
			
		
		
		
	* FEATURE: HyDE-powered semantic search. It relies on the new outlet added in discourse/discourse#23390 to display semantic search results in an unobtrusive way. We'll use a HyDE-backed approach for semantic search, which consists of generating a hypothetical document from the given keywords; this document gets transformed into a vector and used in an asymmetric-similarity topic search. This PR also reorganizes the internals to have fewer moving parts, maintaining one hierarchy of DAO-ish classes for vector-related operations like transformations and querying. Completions and vectors created by HyDE will remain cached in Redis for now, but we could later use Postgres instead. * Missing translation and rate limiting --------- Co-authored-by: Roman Rizzi <rizziromanalejandro@gmail.com>
		
			
				
	
	
		
			36 lines
		
	
	
		
			1.1 KiB
		
	
	
	
		
			Ruby
		
	
	
	
	
	
			
		
		
	
	
			36 lines
		
	
	
		
			1.1 KiB
		
	
	
	
		
			Ruby
		
	
	
	
	
	
# frozen_string_literal: true

# Spec for the truncation embedding strategy: given a topic, a tokenizer, and a
# model's max sequence length, #prepare_text_from must produce text that fits
# within the token budget.
RSpec.describe DiscourseAi::Embeddings::Strategies::Truncation do
  subject(:truncation) { described_class.new }

  describe "#prepare_text_from" do
    context "when using vector from OpenAI" do
      # Raise the post-length cap so the oversized fixture posts can be created.
      before { SiteSetting.max_post_length = 100_000 }

      fab!(:topic) { Fabricate(:topic) }

      # NOTE: the fabricators need distinct names. Defining fab!(:post) three
      # times makes each later definition override the previous one, so only a
      # single post would actually be fabricated and the topic text would not
      # exceed the model's sequence length as intended.
      fab!(:post) do
        Fabricate(:post, topic: topic, raw: "Baby, bird, bird, bird\nBird is the word\n" * 500)
      end
      fab!(:post2) do
        Fabricate(
          :post,
          topic: topic,
          raw: "Don't you know about the bird?\nEverybody knows that the bird is a word\n" * 400,
        )
      end
      fab!(:post3) { Fabricate(:post, topic: topic, raw: "Surfin' bird\n" * 800) }

      let(:model) do
        DiscourseAi::Embeddings::VectorRepresentations::TextEmbeddingAda002.new(truncation)
      end

      it "truncates a topic" do
        prepared_text =
          truncation.prepare_text_from(topic, model.tokenizer, model.max_sequence_length)

        # The prepared text must fit within the model's token budget.
        expect(model.tokenizer.size(prepared_text)).to be <= model.max_sequence_length
      end
    end
  end
end